diff options
author | Hao Chen <chenhao@loongson.cn> | 2021-12-20 19:57:26 +0800 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2022-01-21 01:34:38 +0000 |
commit | de8ae8c679f5a42fb9f9f65318d6cb95112180d6 (patch) | |
tree | 4f504ae4587084990aa39a10f820591f40ff30ed | |
parent | 51de1e16f20bb93468d7c538629b40ece8420b71 (diff) | |
download | libyuv-de8ae8c679f5a42fb9f9f65318d6cb95112180d6.tar.gz |
Add optimization functions in row_lasx.cc file.
Optimize 32 functions in source/row_lasx.cc file.
All test cases passed on loongarch platform.
Bug: libyuv:912
Signed-off-by: Hao Chen <chenhao@loongson.cn>
Change-Id: I7d3f649f753f72ca9bd052d5e0562dbc6f6ccfed
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3351466
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | include/libyuv/row.h | 243 | ||||
-rw-r--r-- | source/convert.cc | 40 | ||||
-rw-r--r-- | source/convert_argb.cc | 48 | ||||
-rw-r--r-- | source/convert_from.cc | 24 | ||||
-rw-r--r-- | source/convert_from_argb.cc | 139 | ||||
-rw-r--r-- | source/planar_functions.cc | 104 | ||||
-rw-r--r-- | source/rotate_argb.cc | 8 | ||||
-rw-r--r-- | source/row_any.cc | 72 | ||||
-rw-r--r-- | source/row_lasx.cc | 1042 |
9 files changed, 1720 insertions, 0 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 37825a39..957eb587 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -689,6 +689,38 @@ extern "C" { #define HAS_I422TOARGBROW_LASX #define HAS_I422TORGBAROW_LASX #define HAS_I422ALPHATOARGBROW_LASX +#define HAS_I422TOYUY2ROW_LASX +#define HAS_I422TOUYVYROW_LASX +#define HAS_MIRRORROW_LASX +#define HAS_MIRRORUVROW_LASX +#define HAS_ARGBMIRRORROW_LASX +#define HAS_I422TORGB24ROW_LASX +#define HAS_I422TORGB565ROW_LASX +#define HAS_I422TOARGB4444ROW_LASX +#define HAS_I422TOARGB1555ROW_LASX +#define HAS_YUY2TOUVROW_LASX +#define HAS_YUY2TOYROW_LASX +#define HAS_YUY2TOUV422ROW_LASX +#define HAS_UYVYTOYROW_LASX +#define HAS_UYVYTOUVROW_LASX +#define HAS_UYVYTOUV422ROW_LASX +#define HAS_ARGBTOYROW_LASX +#define HAS_ARGBTOUVROW_LASX +#define HAS_ARGBTORGB24ROW_LASX +#define HAS_ARGBTORAWROW_LASX +#define HAS_ARGBTORGB565ROW_LASX +#define HAS_ARGBTOARGB1555ROW_LASX +#define HAS_ARGBTOARGB4444ROW_LASX +#define HAS_ARGBTOUV444ROW_LASX +#define HAS_ARGBMULTIPLYROW_LASX +#define HAS_ARGBADDROW_LASX +#define HAS_ARGBSUBTRACTROW_LASX +#define HAS_ARGBATTENUATEROW_LASX +#define HAS_ARGBTORGB565DITHERROW_LASX +#define HAS_ARGBSHUFFLEROW_LASX +#define HAS_ARGBSHADEROW_LASX +#define HAS_ARGBGRAYROW_LASX +#define HAS_ARGBSEPIAROW_LASX #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -1005,24 +1037,48 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB24Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB565Row_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB565Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const 
struct YuvConstants* yuvconstants, + int width); void I422ToARGB4444Row_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB4444Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB1555Row_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB1555Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); void NV12ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb, @@ -1074,6 +1130,7 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1092,6 +1149,15 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVRow_LASX(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_LASX(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_MMI(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1341,6 +1407,8 @@ void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, 
@@ -1439,6 +1507,15 @@ void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVRow_Any_LASX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -1678,6 +1755,7 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1685,15 +1763,18 @@ void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width); +void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int 
width); void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorSplitUVRow_SSSE3(const uint8_t* src, uint8_t* dst_u, @@ -1721,6 +1802,7 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -1733,6 +1815,7 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, int width); void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_rgb24, @@ -2537,6 +2620,10 @@ void ARGBShuffleRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); +void ARGBShuffleRow_LASX(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, @@ -2557,6 +2644,10 @@ void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, int width); +void ARGBShuffleRow_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, @@ -2777,6 +2868,20 @@ void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, 
int width); +void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); + +void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_LASX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToARGB4444Row_LASX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -3896,6 +4001,14 @@ void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBMultiplyRow_LASX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_LASX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); // ARGB add images. void ARGBAddRow_C(const uint8_t* src_argb, @@ -3942,6 +4055,14 @@ void ARGBAddRow_Any_MMI(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBAddRow_LASX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_LASX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); // ARGB subtract images. Same API as Blend, but these require // pointer and width alignment for SSE2. 
@@ -3989,6 +4110,14 @@ void ARGBSubtractRow_Any_MMI(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBSubtractRow_LASX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_LASX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -4077,6 +4206,24 @@ void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, int width); +void ARGBToRGB565DitherRow_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); + +void ARGBToRGB24Row_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB565Row_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB4444Row_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -4291,24 +4438,48 @@ void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB24Row_Any_LASX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB565Row_Any_LASX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct 
YuvConstants* yuvconstants, int width); +void I422ToARGB4444Row_Any_LASX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB1555Row_Any_LASX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -4365,11 +4536,17 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, int width); void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUVRow_LASX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, @@ -4379,6 +4556,10 @@ void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4425,11 +4606,17 @@ void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, int width); void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* 
dst_v, int width); +void YUY2ToUVRow_Any_LASX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, @@ -4439,6 +4626,10 @@ void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUV422Row_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4485,11 +4676,17 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, int width); void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToUVRow_MSA(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUVRow_LASX(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUVRow_MMI(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, @@ -4499,6 +4696,10 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUV422Row_LASX(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, @@ -4546,11 +4747,17 @@ void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr, int width); void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUVRow_Any_LASX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* 
dst_v, + int width); void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, @@ -4560,6 +4767,10 @@ void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUV422Row_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4679,6 +4890,11 @@ void I422ToYUY2Row_MMI(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width); +void I422ToYUY2Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); void I422ToUYVYRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4689,6 +4905,11 @@ void I422ToUYVYRow_MMI(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width); +void I422ToUYVYRow_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4699,6 +4920,11 @@ void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void I422ToYUY2Row_Any_LASX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4709,6 +4935,11 @@ void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void I422ToUYVYRow_Any_LASX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); // Effects related row functions. 
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); @@ -4727,6 +4958,9 @@ void ARGBAttenuateRow_MSA(const uint8_t* src_argb, void ARGBAttenuateRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_LASX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4742,6 +4976,9 @@ void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr, void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBAttenuateRow_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); // Inverse table for unattenuate, shared by C and SSE2. extern const uint32_t fixed_invtbl8[256]; @@ -4766,12 +5003,14 @@ void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBSepiaRow_C(uint8_t* dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width); void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width); +void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width); void ARGBColorMatrixRow_C(const uint8_t* src_argb, uint8_t* dst_argb, @@ -4849,6 +5088,10 @@ void ARGBShadeRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value); +void ARGBShadeRow_LASX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); // Used for blur. 
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, diff --git a/source/convert.cc b/source/convert.cc index c070bf81..1e524de3 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1095,6 +1095,16 @@ int YUY2ToI420(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + YUY2ToYRow = YUY2ToYRow_Any_LASX; + YUY2ToUVRow = YUY2ToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_LASX; + YUY2ToUVRow = YUY2ToUVRow_LASX; + } + } +#endif for (y = 0; y < height - 1; y += 2) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); @@ -1186,6 +1196,16 @@ int UYVYToI420(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + UYVYToYRow = UYVYToYRow_Any_LASX; + UYVYToUVRow = UYVYToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_LASX; + UYVYToUVRow = UYVYToUVRow_LASX; + } + } +#endif for (y = 0; y < height - 1; y += 2) { UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); @@ -1440,6 +1460,16 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -2924,6 +2954,16 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif #endif { diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 
21367c9b..7128e9f9 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -4668,6 +4668,14 @@ int I420ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_LASX; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); @@ -4856,6 +4864,14 @@ int I420ToARGB1555(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGB1555ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_LASX; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, @@ -4937,6 +4953,14 @@ int I420ToARGB4444(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGB4444ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_LASX; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_LASX; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, @@ -5018,6 +5042,14 @@ int I420ToRGB565Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB565ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGB565Row = I422ToRGB565Row_LASX; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width); @@ -5140,6 +5172,14 @@ int I422ToRGB565(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB565ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGB565Row = I422ToRGB565Row_LASX; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGB565Row(src_y, src_u, src_v, 
dst_rgb565, &kYuvI601Constants, width); @@ -5285,6 +5325,14 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX; + } + } +#endif { // Allocate a row of argb. align_buffer_64(row_argb, width * 4); diff --git a/source/convert_from.cc b/source/convert_from.cc index 62a13d04..41a3c17a 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -452,6 +452,14 @@ int I420ToYUY2(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_LASX; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); @@ -539,6 +547,14 @@ int I422ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -614,6 +630,14 @@ int I420ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_LASX; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 55c9ee61..6d147975 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -84,6 +84,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToUV444Row = ARGBToUV444Row_Any_LASX; 
+ if (IS_ALIGNED(width, 32)) { + ARGBToUV444Row = ARGBToUV444Row_LASX; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -124,6 +132,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUV444Row(src_argb, dst_u, dst_v, width); @@ -245,6 +261,17 @@ int ARGBToI422(const uint8_t* src_argb, } #endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif + for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); @@ -355,6 +382,16 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -519,6 +556,16 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -1015,6 +1062,16 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if 
defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; @@ -1055,6 +1112,14 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_LASX; + } + } +#endif { // Allocate a rows of yuv. @@ -1180,6 +1245,16 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; @@ -1220,6 +1295,14 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_LASX; + } + } +#endif { // Allocate a rows of yuv. 
@@ -1305,6 +1388,14 @@ int ARGBToI400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); @@ -1403,6 +1494,14 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -1477,6 +1576,14 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRAWRow = ARGBToRAWRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToRAWRow = ARGBToRAWRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); @@ -1555,6 +1662,14 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, @@ -1632,6 +1747,14 @@ int ARGBToRGB565(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB565Row = ARGBToRGB565Row_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565Row(src_argb, dst_rgb565, width); @@ -1706,6 +1829,14 @@ int ARGBToARGB1555(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LASX; + if (IS_ALIGNED(width, 
16)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB1555Row(src_argb, dst_argb1555, width); @@ -1780,6 +1911,14 @@ int ARGBToARGB4444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB4444ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB4444Row(src_argb, dst_argb4444, width); diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 7cea06c8..af555338 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1728,6 +1728,16 @@ int YUY2ToI422(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + YUY2ToYRow = YUY2ToYRow_Any_LASX; + YUY2ToUV422Row = YUY2ToUV422Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_LASX; + YUY2ToUV422Row = YUY2ToUV422Row_LASX; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); @@ -1824,6 +1834,16 @@ int UYVYToI422(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LASX) && defined(HAS_UYVYTOUV422ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + UYVYToYRow = UYVYToYRow_Any_LASX; + UYVYToUV422Row = UYVYToUV422Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_LASX; + UYVYToUV422Row = UYVYToUV422Row_LASX; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); @@ -1968,6 +1988,14 @@ void MirrorPlane(const uint8_t* src_y, } } #endif +#if defined(HAS_MIRRORROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + MirrorRow = MirrorRow_Any_LASX; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_LASX; + } + } +#endif // Mirror plane for (y = 0; y < height; ++y) { @@ -2026,6 +2054,14 @@ void MirrorUVPlane(const uint8_t* src_uv, } } #endif +#if 
defined(HAS_MIRRORUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + MirrorUVRow = MirrorUVRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + MirrorUVRow = MirrorUVRow_LASX; + } + } +#endif // MirrorUV plane for (y = 0; y < height; ++y) { @@ -2194,6 +2230,14 @@ int ARGBMirror(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_LASX; + } + } +#endif // Mirror plane for (y = 0; y < height; ++y) { @@ -2602,6 +2646,14 @@ int ARGBMultiply(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBMULTIPLYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_LASX; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_LASX; + } + } +#endif // Multiply plane for (y = 0; y < height; ++y) { @@ -2687,6 +2739,14 @@ int ARGBAdd(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBADDROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBAddRow = ARGBAddRow_Any_LASX; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_LASX; + } + } +#endif // Add plane for (y = 0; y < height; ++y) { @@ -2767,6 +2827,14 @@ int ARGBSubtract(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBSUBTRACTROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBSubtractRow = ARGBSubtractRow_Any_LASX; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_LASX; + } + } +#endif // Subtract plane for (y = 0; y < height; ++y) { @@ -3073,6 +3141,14 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBAttenuateRow = ARGBAttenuateRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); @@ -3178,6 +3254,11 @@ int ARGBGrayTo(const uint8_t* src_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if 
defined(HAS_ARGBGRAYROW_LASX) + if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { + ARGBGrayRow = ARGBGrayRow_LASX; + } +#endif for (y = 0; y < height; ++y) { ARGBGrayRow(src_argb, dst_argb, width); @@ -3228,6 +3309,11 @@ int ARGBGray(uint8_t* dst_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_LASX) + if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { + ARGBGrayRow = ARGBGrayRow_LASX; + } +#endif for (y = 0; y < height; ++y) { ARGBGrayRow(dst, dst, width); @@ -3276,6 +3362,11 @@ int ARGBSepia(uint8_t* dst_argb, ARGBSepiaRow = ARGBSepiaRow_MSA; } #endif +#if defined(HAS_ARGBSEPIAROW_LASX) + if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { + ARGBSepiaRow = ARGBSepiaRow_LASX; + } +#endif for (y = 0; y < height; ++y) { ARGBSepiaRow(dst, width); @@ -3706,6 +3797,11 @@ int ARGBShade(const uint8_t* src_argb, ARGBShadeRow = ARGBShadeRow_MSA; } #endif +#if defined(HAS_ARGBSHADEROW_LASX) + if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 8)) { + ARGBShadeRow = ARGBShadeRow_LASX; + } +#endif for (y = 0; y < height; ++y) { ARGBShadeRow(src_argb, dst_argb, width, value); @@ -3916,6 +4012,14 @@ int ARGBShuffle(const uint8_t* src_bgra, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBShuffleRow = ARGBShuffleRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBShuffleRow = ARGBShuffleRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index ae653886..4d36a910 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -163,6 +163,14 @@ static int ARGBRotate180(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_LASX; + } + } +#endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = 
IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; diff --git a/source/row_any.cc b/source/row_any.cc index 7d24b15c..b1b5f8a9 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -297,6 +297,9 @@ ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) #ifdef HAS_I422TOYUY2ROW_MMI ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7) #endif +#ifdef HAS_I422TOYUY2ROW_LASX +ANY31(I422ToYUY2Row_Any_LASX, I422ToYUY2Row_LASX, 1, 1, 4, 31) +#endif #ifdef HAS_I422TOUYVYROW_NEON ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #endif @@ -306,6 +309,9 @@ ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) #ifdef HAS_I422TOUYVYROW_MMI ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7) #endif +#ifdef HAS_I422TOUYVYROW_LASX +ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31) +#endif #ifdef HAS_BLENDPLANEROW_AVX2 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) #endif @@ -425,6 +431,10 @@ ANY31C(I422ToRGBARow_Any_MMI, I422ToRGBARow_MMI, 1, 0, 4, 7) #ifdef HAS_I422TOARGBROW_LASX ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31) ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31) +ANY31C(I422ToRGB24Row_Any_LASX, I422ToRGB24Row_LASX, 1, 0, 3, 31) +ANY31C(I422ToRGB565Row_Any_LASX, I422ToRGB565Row_LASX, 1, 0, 2, 31) +ANY31C(I422ToARGB4444Row_Any_LASX, I422ToARGB4444Row_LASX, 1, 0, 2, 31) +ANY31C(I422ToARGB1555Row_Any_LASX, I422ToARGB1555Row_LASX, 1, 0, 2, 31) #endif #undef ANY31C @@ -631,18 +641,27 @@ ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) #ifdef HAS_ARGBMULTIPLYROW_MMI ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1) #endif +#ifdef HAS_ARGBMULTIPLYROW_LASX +ANY21(ARGBMultiplyRow_Any_LASX, ARGBMultiplyRow_LASX, 0, 4, 4, 4, 7) +#endif #ifdef HAS_ARGBADDROW_MSA ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) #endif #ifdef HAS_ARGBADDROW_MMI ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1) #endif +#ifdef 
HAS_ARGBADDROW_LASX +ANY21(ARGBAddRow_Any_LASX, ARGBAddRow_LASX, 0, 4, 4, 4, 7) +#endif #ifdef HAS_ARGBSUBTRACTROW_MSA ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) #endif #ifdef HAS_ARGBSUBTRACTROW_MMI ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1) #endif +#ifdef HAS_ARGBSUBTRACTROW_LASX +ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7) +#endif #ifdef HAS_SOBELROW_SSE2 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) #endif @@ -953,6 +972,13 @@ ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3) ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3) ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3) #endif +#if defined(HAS_ARGBTORGB24ROW_LASX) +ANY11(ARGBToRGB24Row_Any_LASX, ARGBToRGB24Row_LASX, 0, 4, 3, 31) +ANY11(ARGBToRAWRow_Any_LASX, ARGBToRAWRow_LASX, 0, 4, 3, 31) +ANY11(ARGBToRGB565Row_Any_LASX, ARGBToRGB565Row_LASX, 0, 4, 2, 15) +ANY11(ARGBToARGB1555Row_Any_LASX, ARGBToARGB1555Row_LASX, 0, 4, 2, 15) +ANY11(ARGBToARGB4444Row_Any_LASX, ARGBToARGB4444Row_LASX, 0, 4, 2, 15) +#endif #if defined(HAS_RAWTORGB24ROW_NEON) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) #endif @@ -1007,6 +1033,9 @@ ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) #ifdef HAS_ARGBTOYROW_MMI ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7) #endif +#ifdef HAS_ARGBTOYROW_LASX +ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_ARGBTOYJROW_NEON ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) #endif @@ -1115,12 +1144,18 @@ ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) #ifdef HAS_YUY2TOYROW_MSA ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOYROW_LASX +ANY11(YUY2ToYRow_Any_LASX, YUY2ToYRow_LASX, 1, 4, 1, 31) +#endif #ifdef HAS_YUY2TOYROW_MMI ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7) #endif #ifdef HAS_UYVYTOYROW_MSA ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) 
#endif +#ifdef HAS_UYVYTOYROW_LASX +ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31) +#endif #ifdef HAS_UYVYTOYROW_MMI ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15) #endif @@ -1205,6 +1240,9 @@ ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_MMI ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1) #endif +#ifdef HAS_ARGBATTENUATEROW_LASX +ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15) +#endif #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) #endif @@ -1354,6 +1392,14 @@ ANY11P(ARGBToRGB565DitherRow_Any_MMI, 2, 3) #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LASX) +ANY11P(ARGBToRGB565DitherRow_Any_LASX, + ARGBToRGB565DitherRow_LASX, + const uint32_t, + 4, + 2, + 15) +#endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) #endif @@ -1369,6 +1415,9 @@ ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) #ifdef HAS_ARGBSHUFFLEROW_MMI ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1) #endif +#ifdef HAS_ARGBSHUFFLEROW_LASX +ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15) +#endif #undef ANY11P #undef ANY11P @@ -1667,6 +1716,9 @@ ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) #ifdef HAS_MIRRORROW_MMI ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7) #endif +#ifdef HAS_MIRRORROW_LASX +ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63) +#endif #ifdef HAS_MIRRORUVROW_AVX2 ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) #endif @@ -1679,6 +1731,9 @@ ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) #ifdef HAS_MIRRORUVROW_MSA ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7) #endif +#ifdef HAS_MIRRORUVROW_LASX +ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15) +#endif #ifdef HAS_ARGBMIRRORROW_AVX2 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) #endif @@ 
-1691,6 +1746,9 @@ ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) #ifdef HAS_ARGBMIRRORROW_MSA ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) #endif +#ifdef HAS_ARGBMIRRORROW_LASX +ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15) +#endif #ifdef HAS_ARGBMIRRORROW_MMI ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1) #endif @@ -1791,6 +1849,11 @@ ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7) ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15) ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15) #endif +#ifdef HAS_YUY2TOUV422ROW_LASX +ANY12(ARGBToUV444Row_Any_LASX, ARGBToUV444Row_LASX, 0, 4, 0, 31) +ANY12(YUY2ToUV422Row_Any_LASX, YUY2ToUV422Row_LASX, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_LASX, UYVYToUV422Row_LASX, 1, 4, 1, 31) +#endif #undef ANY12 // Any 2 16 bit planes with parameter to 1 @@ -1951,6 +2014,9 @@ ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) #ifdef HAS_ARGBTOUVROW_MMI ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVROW_LASX +ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31) +#endif #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif @@ -2047,9 +2113,15 @@ ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) #ifdef HAS_YUY2TOUVROW_MMI ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15) #endif +#ifdef HAS_YUY2TOUVROW_LASX +ANY12S(YUY2ToUVRow_Any_LASX, YUY2ToUVRow_LASX, 1, 4, 31) +#endif #ifdef HAS_UYVYTOUVROW_MSA ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_UYVYTOUVROW_LASX +ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31) +#endif #ifdef HAS_UYVYTOUVROW_MMI ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15) #endif diff --git a/source/row_lasx.cc b/source/row_lasx.cc index 0e7b38a1..b9c7cc16 100644 --- a/source/row_lasx.cc +++ b/source/row_lasx.cc @@ -197,6 +197,125 @@ extern "C" { pdst_argb += 64; \ } +void MirrorRow_LASX(const uint8_t* src, 
uint8_t* dst, int width) { + int x; + int len = width / 64; + __m256i src0, src1; + __m256i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607, + 0x08090A0B0C0D0E0F, 0x0001020304050607}; + src += width - 64; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1); + DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, + src1, src1, shuffler, src0, src1); + src0 = __lasx_xvpermi_q(src0, src0, 0x01); + src1 = __lasx_xvpermi_q(src1, src1, 0x01); + __lasx_xvst(src1, dst, 0); + __lasx_xvst(src0, dst, 32); + dst += 64; + src -= 64; + } +} + +void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + int len = width / 16; + __m256i src, dst; + __m256i shuffler = {0x0004000500060007, 0x0000000100020003, + 0x0004000500060007, 0x0000000100020003}; + + src_uv += (width - 16) << 1; + for (x = 0; x < len; x++) { + src = __lasx_xvld(src_uv, 0); + dst = __lasx_xvshuf_h(shuffler, src, src); + dst = __lasx_xvpermi_q(dst, dst, 0x01); + __lasx_xvst(dst, dst_uv, 0); + src_uv -= 32; + dst_uv += 32; + } +} + +void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 16; + __m256i src0, src1; + __m256i dst0, dst1; + __m256i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504, + 0x0B0A09080F0E0D0C, 0x0302010007060504}; + src += (width * 4) - 64; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1); + DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, + src1, src1, shuffler, src0, src1); + dst1 = __lasx_xvpermi_q(src0, src0, 0x01); + dst0 = __lasx_xvpermi_q(src1, src1, 0x01); + __lasx_xvst(dst0, dst, 0); + __lasx_xvst(dst1, dst, 32); + dst += 64; + src -= 64; + } +} + +void I422ToYUY2Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + int len = width / 32; + __m256i src_u0, src_v0, src_y0, vec_uv0; + __m256i vec_yuy2_0, vec_yuy2_1; + __m256i dst_yuy2_0, dst_yuy2_1; + + for (x = 0; x < len; x++) { + 
DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lasx_xvld(src_y, 0); + src_u0 = __lasx_xvpermi_d(src_u0, 0xD8); + src_v0 = __lasx_xvpermi_d(src_v0, 0xD8); + vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0); + vec_yuy2_0 = __lasx_xvilvl_b(vec_uv0, src_y0); + vec_yuy2_1 = __lasx_xvilvh_b(vec_uv0, src_y0); + dst_yuy2_0 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x20); + dst_yuy2_1 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x31); + __lasx_xvst(dst_yuy2_0, dst_yuy2, 0); + __lasx_xvst(dst_yuy2_1, dst_yuy2, 32); + src_u += 16; + src_v += 16; + src_y += 32; + dst_yuy2 += 64; + } +} + +void I422ToUYVYRow_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + int x; + int len = width / 32; + __m256i src_u0, src_v0, src_y0, vec_uv0; + __m256i vec_uyvy0, vec_uyvy1; + __m256i dst_uyvy0, dst_uyvy1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lasx_xvld(src_y, 0); + src_u0 = __lasx_xvpermi_d(src_u0, 0xD8); + src_v0 = __lasx_xvpermi_d(src_v0, 0xD8); + vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0); + vec_uyvy0 = __lasx_xvilvl_b(src_y0, vec_uv0); + vec_uyvy1 = __lasx_xvilvh_b(src_y0, vec_uv0); + dst_uyvy0 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x20); + dst_uyvy1 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x31); + __lasx_xvst(dst_uyvy0, dst_uyvy, 0); + __lasx_xvst(dst_uyvy1, dst_uyvy, 32); + src_u += 16; + src_v += 16; + src_y += 32; + dst_uyvy +=64; + } +} + void I422ToARGBRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -295,6 +414,929 @@ void I422AlphaToARGBRow_LASX(const uint8_t* src_y, } } +void I422ToRGB24Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg; + __m256i vec_ubvr, vec_ugvg; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i shuffler0 = 
{0x0504120302100100, 0x0A18090816070614, + 0x0504120302100100, 0x0A18090816070614}; + __m256i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B, + 0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B}; + + YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m256i temp0, temp1, temp2, temp3; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, + vec_yb, b_l, b_h, g_l, g_h, r_l, r_h); + temp0 = __lasx_xvpackev_b(g_l, b_l); + temp1 = __lasx_xvpackev_b(g_h, b_h); + DUP4_ARG3(__lasx_xvshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, + r_l, temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0, temp1); + + b_l = __lasx_xvilvl_d(temp1, temp2); + b_h = __lasx_xvilvh_d(temp3, temp1); + temp1 = __lasx_xvpermi_q(b_l, temp0, 0x20); + temp2 = __lasx_xvpermi_q(temp0, b_h, 0x30); + temp3 = __lasx_xvpermi_q(b_h, b_l, 0x31); + __lasx_xvst(temp1, dst_argb, 0); + __lasx_xvst(temp2, dst_argb, 32); + __lasx_xvst(temp3, dst_argb, 64); + dst_argb += 96; + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. 
+void I422ToRGB565Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg; + __m256i vec_ubvr, vec_ugvg; + __m256i const_0x80 = __lasx_xvldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m256i dst_l, dst_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, + vec_yb, b_l, b_h, g_l, g_h, r_l, r_h); + b_l = __lasx_xvsrli_h(b_l, 3); + b_h = __lasx_xvsrli_h(b_h, 3); + g_l = __lasx_xvsrli_h(g_l, 2); + g_h = __lasx_xvsrli_h(g_h, 2); + r_l = __lasx_xvsrli_h(r_l, 3); + r_h = __lasx_xvsrli_h(r_h, 3); + r_l = __lasx_xvslli_h(r_l, 11); + r_h = __lasx_xvslli_h(r_h, 11); + g_l = __lasx_xvslli_h(g_l, 5); + g_h = __lasx_xvslli_h(g_h, 5); + r_l = __lasx_xvor_v(r_l, g_l); + r_l = __lasx_xvor_v(r_l, b_l); + r_h = __lasx_xvor_v(r_h, g_h); + r_h = __lasx_xvor_v(r_h, b_h); + dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20); + dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31); + __lasx_xvst(dst_l, dst_rgb565, 0); + __lasx_xvst(dst_h, dst_rgb565, 32); + dst_rgb565 += 64; + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 
+void I422ToARGB4444Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg; + __m256i vec_ubvr, vec_ugvg; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i alpha = {0xF000F000F000F000, 0xF000F000F000F000, + 0xF000F000F000F000, 0xF000F000F000F000}; + __m256i mask = {0x00F000F000F000F0, 0x00F000F000F000F0, + 0x00F000F000F000F0, 0x00F000F000F000F0}; + + YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m256i dst_l, dst_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, + vec_yb, b_l, b_h, g_l, g_h, r_l, r_h); + b_l = __lasx_xvsrli_h(b_l, 4); + b_h = __lasx_xvsrli_h(b_h, 4); + r_l = __lasx_xvsrli_h(r_l, 4); + r_h = __lasx_xvsrli_h(r_h, 4); + g_l = __lasx_xvand_v(g_l, mask); + g_h = __lasx_xvand_v(g_h, mask); + r_l = __lasx_xvslli_h(r_l, 8); + r_h = __lasx_xvslli_h(r_h, 8); + r_l = __lasx_xvor_v(r_l, alpha); + r_h = __lasx_xvor_v(r_h, alpha); + r_l = __lasx_xvor_v(r_l, g_l); + r_h = __lasx_xvor_v(r_h, g_h); + r_l = __lasx_xvor_v(r_l, b_l); + r_h = __lasx_xvor_v(r_h, b_h); + dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20); + dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31); + __lasx_xvst(dst_l, dst_argb4444, 0); + __lasx_xvst(dst_h, dst_argb4444, 32); + dst_argb4444 += 64; + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +void I422ToARGB1555Row_LASX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 32; + __m256i vec_yb, vec_yg; + __m256i vec_ubvr, vec_ugvg; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i alpha = {0x8000800080008000, 0x8000800080008000, + 0x8000800080008000, 0x8000800080008000}; + + 
YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb); + + for (x = 0; x < len; x++) { + __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m256i dst_l, dst_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, + vec_yb, b_l, b_h, g_l, g_h, r_l, r_h); + b_l = __lasx_xvsrli_h(b_l, 3); + b_h = __lasx_xvsrli_h(b_h, 3); + g_l = __lasx_xvsrli_h(g_l, 3); + g_h = __lasx_xvsrli_h(g_h, 3); + g_l = __lasx_xvslli_h(g_l, 5); + g_h = __lasx_xvslli_h(g_h, 5); + r_l = __lasx_xvsrli_h(r_l, 3); + r_h = __lasx_xvsrli_h(r_h, 3); + r_l = __lasx_xvslli_h(r_l, 10); + r_h = __lasx_xvslli_h(r_h, 10); + r_l = __lasx_xvor_v(r_l, alpha); + r_h = __lasx_xvor_v(r_h, alpha); + r_l = __lasx_xvor_v(r_l, g_l); + r_h = __lasx_xvor_v(r_h, g_h); + r_l = __lasx_xvor_v(r_l, b_l); + r_h = __lasx_xvor_v(r_h, b_h); + dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20); + dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31); + __lasx_xvst(dst_l, dst_argb1555, 0); + __lasx_xvst(dst_h, dst_argb1555, 32); + dst_argb1555 += 64; + src_y += 32; + src_u += 16; + src_v += 16; + } +} + +void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1); + dst0 = __lasx_xvpickev_b(src1, src0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_y, 0); + src_yuy2 += 64; + dst_y += 32; + } +} + +void YUY2ToUVRow_LASX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + int len = width / 32; + __m256i src0, src1, src2, src3; + __m256i tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src_yuy2_next, 0, + src_yuy2_next, 32, src0, src1, src2, src3); + src0 = __lasx_xvpickod_b(src1, src0); + src1 = __lasx_xvpickod_b(src3, src2); + 
tmp0 = __lasx_xvavgr_bu(src1, src0); + tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); + dst0 = __lasx_xvpickev_b(tmp0, tmp0); + dst1 = __lasx_xvpickod_b(tmp0, tmp0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst1, dst_v, 0, 0); + __lasx_xvstelm_d(dst1, dst_v, 8, 2); + src_yuy2 += 64; + src_yuy2_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + __m256i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1); + tmp0 = __lasx_xvpickod_b(src1, src0); + tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); + dst0 = __lasx_xvpickev_b(tmp0, tmp0); + dst1 = __lasx_xvpickod_b(tmp0, tmp0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst1, dst_v, 0, 0); + __lasx_xvstelm_d(dst1, dst_v, 8, 2); + src_yuy2 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1); + dst0 = __lasx_xvpickod_b(src1, src0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_y, 0); + src_uyvy += 64; + dst_y += 32; + } +} + +void UYVYToUVRow_LASX(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + int len = width / 32; + __m256i src0, src1, src2, src3, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src_uyvy_next, 0, + src_uyvy_next, 32, src0, src1, src2, src3); + src0 = __lasx_xvpickev_b(src1, src0); + src1 = __lasx_xvpickev_b(src3, src2); + tmp0 = __lasx_xvavgr_bu(src1, src0); + tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); + 
// ---------------------------------------------------------------------------
// LoongArch LASX (256-bit SIMD) row kernels for libyuv (source/row_lasx.cc).
// Each function processes pixels in fixed-size vector batches (16 or 32
// pixels per loop iteration, visible from `width / N` and the pointer
// increments); remainders are handled by the *_Any wrappers declared via the
// HAS_*_LASX macros in row.h.  libyuv "ARGB" is byte order B,G,R,A in memory.
// NOTE(review): lane semantics of the xvldi / xvpermi / xvshuf immediates are
// inferred from the visible code — confirm against the LASX intrinsics manual.
// ---------------------------------------------------------------------------

    // --- tail of UYVYToUVRow_LASX (function opens before this chunk) ---
    // Split the packed chroma bytes into U (even) and V (odd) and store
    // 16 bytes of each: the two 8-byte element stores pick the low 64-bit
    // element of each 128-bit half of the 256-bit register.
    dst0 = __lasx_xvpickev_b(tmp0, tmp0);
    dst1 = __lasx_xvpickod_b(tmp0, tmp0);
    __lasx_xvstelm_d(dst0, dst_u, 0, 0);
    __lasx_xvstelm_d(dst0, dst_u, 8, 2);
    __lasx_xvstelm_d(dst1, dst_v, 0, 0);
    __lasx_xvstelm_d(dst1, dst_v, 8, 2);
    src_uyvy += 64;  // 32 UYVY pixels = 64 bytes per row consumed
    src_uyvy_next += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

// Extracts the U and V planes from one row of packed UYVY (no vertical
// subsampling — single-row variant).  32 pixels (64 bytes) per iteration.
void UYVYToUV422Row_LASX(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  int x;
  int len = width / 32;
  __m256i src0, src1, tmp0, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1);
    // Even bytes of UYVY are the chroma samples (U,V interleaved); pick
    // them, fix cross-lane order (0xD8 = identity within 128-bit halves,
    // swapping the middle two 64-bit words), then split U from V.
    tmp0 = __lasx_xvpickev_b(src1, src0);
    tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
    dst0 = __lasx_xvpickev_b(tmp0, tmp0);
    dst1 = __lasx_xvpickod_b(tmp0, tmp0);
    __lasx_xvstelm_d(dst0, dst_u, 0, 0);
    __lasx_xvstelm_d(dst0, dst_u, 8, 2);
    __lasx_xvstelm_d(dst1, dst_v, 0, 0);
    __lasx_xvstelm_d(dst1, dst_v, 8, 2);
    src_uyvy += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

// Converts one row of ARGB to luma Y.  From the constants and even/odd lane
// multiplies below: Y = (25*B + 129*G + 66*R + 0x1080) >> 8, i.e. BT.601
// studio-range weights with rounding and a +16 offset folded into 0x1080.
// 32 pixels (128 bytes) per iteration.
void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  int x;
  int len = width / 32;
  __m256i src0, src1, src2, src3, vec0, vec1, vec2, vec3;
  __m256i tmp0, tmp1, dst0;
  __m256i const_19 = __lasx_xvldi(0x19);  // 25  (B weight)
  __m256i const_42 = __lasx_xvldi(0x42);  // 66  (R weight)
  __m256i const_81 = __lasx_xvldi(0x81);  // 129 (G weight)
  __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
                        0x1080108010801080, 0x1080108010801080};
  // Word permutation that undoes the 128-bit-lane interleave of the
  // pick/narrow operations so bytes land in memory order.
  __m256i control = {0x0000000400000000, 0x0000000500000001,
                     0x0000000600000002, 0x0000000700000003};

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
              src_argb0, 96, src0, src1, src2, src3);
    // vec0/vec1: even bytes = B,R interleaved; vec2/vec3: odd bytes = G,A.
    vec0 = __lasx_xvpickev_b(src1, src0);
    vec1 = __lasx_xvpickev_b(src3, src2);
    vec2 = __lasx_xvpickod_b(src1, src0);
    vec3 = __lasx_xvpickod_b(src3, src2);
    // Accumulate 25*B (even lanes of vec0/1), 129*G (even lanes of vec2/3),
    // 66*R (odd lanes of vec0/1) on top of the 0x1080 rounding bias.
    tmp0 = __lasx_xvmaddwev_h_bu(const_1080, vec0, const_19);
    tmp1 = __lasx_xvmaddwev_h_bu(const_1080, vec1, const_19);
    tmp0 = __lasx_xvmaddwev_h_bu(tmp0, vec2, const_81);
    tmp1 = __lasx_xvmaddwev_h_bu(tmp1, vec3, const_81);
    tmp0 = __lasx_xvmaddwod_h_bu(tmp0, vec0, const_42);
    tmp1 = __lasx_xvmaddwod_h_bu(tmp1, vec1, const_42);
    dst0 = __lasx_xvssrani_b_h(tmp1, tmp0, 8);  // >>8, saturate, narrow
    dst0 = __lasx_xvperm_w(dst0, control);
    __lasx_xvst(dst0, dst_y, 0);
    src_argb0 += 128;
    dst_y += 32;
  }
}

// Converts two rows of ARGB to 2x2-subsampled U and V.  Channel sums of the
// two rows are built with addwev/addwod, then horizontally averaged; the
// coefficients are therefore half the usual ones (56,37,19 / 56,47,9):
//   U = (0x8080 + 56*B - 37*G - 19*R) >> 8
//   V = (0x8080 + 56*R - 47*G -  9*B) >> 8
// 32 output chroma samples (64 source pixels per row) per iteration.
void ARGBToUVRow_LASX(const uint8_t* src_argb0,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int x;
  int len = width / 32;
  const uint8_t* src_argb1 = src_argb0 + src_stride_argb;  // row below

  __m256i src0, src1, src2, src3, src4, src5, src6, src7;
  __m256i vec0, vec1, vec2, vec3;
  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
  __m256i const_0x70 = {0x0038003800380038, 0x0038003800380038,
                        0x0038003800380038, 0x0038003800380038};  // 56
  __m256i const_0x4A = {0x0025002500250025, 0x0025002500250025,
                        0x0025002500250025, 0x0025002500250025};  // 37
  __m256i const_0x26 = {0x0013001300130013, 0x0013001300130013,
                        0x0013001300130013, 0x0013001300130013};  // 19
  __m256i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f,
                        0x002f002f002f002f, 0x002f002f002f002f};  // 47
  __m256i const_0x12 = {0x0009000900090009, 0x0009000900090009,
                        0x0009000900090009, 0x0009000900090009};  // 9
  __m256i control = {0x0000000400000000, 0x0000000500000001,
                     0x0000000600000002, 0x0000000700000003};
  __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080,
                          0x8080808080808080, 0x8080808080808080};

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
              src_argb0, 96, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvld, src_argb1, 0, src_argb1, 32, src_argb1, 64,
              src_argb1, 96, src4, src5, src6, src7);
    // Vertical sums of even bytes (B,R interleaved as halfwords).
    vec0 = __lasx_xvaddwev_h_bu(src0, src4);
    vec1 = __lasx_xvaddwev_h_bu(src1, src5);
    vec2 = __lasx_xvaddwev_h_bu(src2, src6);
    vec3 = __lasx_xvaddwev_h_bu(src3, src7);
    tmp0 = __lasx_xvpickev_h(vec1, vec0);  // B sums
    tmp1 = __lasx_xvpickev_h(vec3, vec2);
    tmp2 = __lasx_xvpickod_h(vec1, vec0);  // R sums
    tmp3 = __lasx_xvpickod_h(vec3, vec2);
    // Vertical sums of odd bytes (G,A); only G is used below.
    vec0 = __lasx_xvaddwod_h_bu(src0, src4);
    vec1 = __lasx_xvaddwod_h_bu(src1, src5);
    vec2 = __lasx_xvaddwod_h_bu(src2, src6);
    vec3 = __lasx_xvaddwod_h_bu(src3, src7);
    tmp4 = __lasx_xvpickev_h(vec1, vec0);  // G sums
    tmp5 = __lasx_xvpickev_h(vec3, vec2);
    // Horizontal 2x average of adjacent columns: src0 = B, src1 = R, src2 = G.
    vec0 = __lasx_xvpickev_h(tmp1, tmp0);
    vec1 = __lasx_xvpickod_h(tmp1, tmp0);
    src0 = __lasx_xvavgr_h(vec0, vec1);
    vec0 = __lasx_xvpickev_h(tmp3, tmp2);
    vec1 = __lasx_xvpickod_h(tmp3, tmp2);
    src1 = __lasx_xvavgr_h(vec0, vec1);
    vec0 = __lasx_xvpickev_h(tmp5, tmp4);
    vec1 = __lasx_xvpickod_h(tmp5, tmp4);
    src2 = __lasx_xvavgr_h(vec0, vec1);
    dst0 = __lasx_xvmadd_h(const_0x8080, src0, const_0x70);  // U: +56*B
    dst0 = __lasx_xvmsub_h(dst0, src2, const_0x4A);          //    -37*G
    dst0 = __lasx_xvmsub_h(dst0, src1, const_0x26);          //    -19*R
    dst1 = __lasx_xvmadd_h(const_0x8080, src1, const_0x70);  // V: +56*R
    dst1 = __lasx_xvmsub_h(dst1, src2, const_0x5E);          //    -47*G
    dst1 = __lasx_xvmsub_h(dst1, src0, const_0x12);          //    -9*B
    dst0 = __lasx_xvperm_w(dst0, control);
    dst1 = __lasx_xvperm_w(dst1, control);
    dst0 = __lasx_xvssrani_b_h(dst0, dst0, 8);
    dst1 = __lasx_xvssrani_b_h(dst1, dst1, 8);
    __lasx_xvstelm_d(dst0, dst_u, 0, 0);
    __lasx_xvstelm_d(dst0, dst_u, 8, 2);
    __lasx_xvstelm_d(dst1, dst_v, 0, 0);
    __lasx_xvstelm_d(dst1, dst_v, 8, 2);
    src_argb0 += 128;
    src_argb1 += 128;
    dst_u += 16;
    dst_v += 16;
  }
}

// Drops the alpha byte: ARGB (4 bpp) -> RGB24 (3 bpp).  Each xvst writes a
// full 32 bytes at 24-byte strides, so consecutive stores deliberately
// overlap by 8 bytes; the final 32 pixels are peeled out of the loop and
// finished with exact 8-byte element stores so the last write does not run
// past the end of dst_rgb.  Hence `len = (width / 32) - 1` plus a tail.
void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  int len = (width / 32) - 1;
  __m256i src0, src1, src2, src3;
  __m256i tmp0, tmp1, tmp2, tmp3;
  // Per-128-bit-lane byte gather of B,G,R triples (alpha skipped).
  __m256i shuf = {0x0908060504020100, 0x000000000E0D0C0A,
                  0x0908060504020100, 0x000000000E0D0C0A};
  // Compacts the 24 valid bytes of each register to the front.
  __m256i control = {0x0000000100000000, 0x0000000400000002,
                     0x0000000600000005, 0x0000000700000003};
  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64,
              src_argb, 96, src0, src1, src2, src3);
    tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
    tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
    tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
    tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
    tmp0 = __lasx_xvperm_w(tmp0, control);
    tmp1 = __lasx_xvperm_w(tmp1, control);
    tmp2 = __lasx_xvperm_w(tmp2, control);
    tmp3 = __lasx_xvperm_w(tmp3, control);
    __lasx_xvst(tmp0, dst_rgb, 0);
    __lasx_xvst(tmp1, dst_rgb, 24);
    __lasx_xvst(tmp2, dst_rgb, 48);
    __lasx_xvst(tmp3, dst_rgb, 72);
    dst_rgb += 96;
    src_argb += 128;
  }
  // Tail: last batch of 32 pixels, bounded final stores (24 bytes exactly).
  DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64,
            src_argb, 96, src0, src1, src2, src3);
  tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
  tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
  tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
  tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
  tmp0 = __lasx_xvperm_w(tmp0, control);
  tmp1 = __lasx_xvperm_w(tmp1, control);
  tmp2 = __lasx_xvperm_w(tmp2, control);
  tmp3 = __lasx_xvperm_w(tmp3, control);
  __lasx_xvst(tmp0, dst_rgb, 0);
  __lasx_xvst(tmp1, dst_rgb, 24);
  __lasx_xvst(tmp2, dst_rgb, 48);
  dst_rgb += 72;
  __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
  __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
  __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
}

// ARGB -> RAW (R,G,B byte order, 3 bpp).  Identical structure to
// ARGBToRGB24Row_LASX above (overlapping 32-byte stores, peeled tail);
// only the byte-shuffle pattern differs, swapping B and R.
void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  int len = (width / 32) - 1;
  __m256i src0, src1, src2, src3;
  __m256i tmp0, tmp1, tmp2, tmp3;
  __m256i shuf = {0x090A040506000102, 0x000000000C0D0E08,
                  0x090A040506000102, 0x000000000C0D0E08};
  __m256i control = {0x0000000100000000, 0x0000000400000002,
                     0x0000000600000005, 0x0000000700000003};
  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64,
              src_argb, 96, src0, src1, src2, src3);
    tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
    tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
    tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
    tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
    tmp0 = __lasx_xvperm_w(tmp0, control);
    tmp1 = __lasx_xvperm_w(tmp1, control);
    tmp2 = __lasx_xvperm_w(tmp2, control);
    tmp3 = __lasx_xvperm_w(tmp3, control);
    __lasx_xvst(tmp0, dst_rgb, 0);
    __lasx_xvst(tmp1, dst_rgb, 24);
    __lasx_xvst(tmp2, dst_rgb, 48);
    __lasx_xvst(tmp3, dst_rgb, 72);
    dst_rgb += 96;
    src_argb += 128;
  }
  // Tail batch with exact-width final stores.
  DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64,
            src_argb, 96, src0, src1, src2, src3);
  tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
  tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
  tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
  tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
  tmp0 = __lasx_xvperm_w(tmp0, control);
  tmp1 = __lasx_xvperm_w(tmp1, control);
  tmp2 = __lasx_xvperm_w(tmp2, control);
  tmp3 = __lasx_xvperm_w(tmp3, control);
  __lasx_xvst(tmp0, dst_rgb, 0);
  __lasx_xvst(tmp1, dst_rgb, 24);
  __lasx_xvst(tmp2, dst_rgb, 48);
  dst_rgb += 72;
  __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
  __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
  __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
}

// ARGB -> RGB565 (5-bit B/R, 6-bit G, little-endian halfword per pixel).
// 16 pixels per iteration.  The per-byte variable shift `shift` is 0 for B
// bytes and 3 for R bytes so R lands in bits 11..15 of its halfword.
void ARGBToRGB565Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  int len = width / 16;
  __m256i zero = __lasx_xvldi(0);
  __m256i src0, src1, tmp0, tmp1, dst0;
  __m256i shift = {0x0300030003000300, 0x0300030003000300,
                   0x0300030003000300, 0x0300030003000300};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);  // B,R interleaved
    tmp1 = __lasx_xvpickod_b(src1, src0);  // G,A interleaved
    tmp0 = __lasx_xvsrli_b(tmp0, 3);       // B>>3, R>>3
    tmp1 = __lasx_xvpackev_b(zero, tmp1);  // widen G to halfwords
    tmp1 = __lasx_xvsrli_h(tmp1, 2);       // G>>2 (6 bits)
    tmp0 = __lasx_xvsll_b(tmp0, shift);    // B<<0, R<<3 within their bytes
    tmp1 = __lasx_xvslli_h(tmp1, 5);       // G into bits 5..10
    dst0 = __lasx_xvor_v(tmp0, tmp1);
    dst0 = __lasx_xvpermi_d(dst0, 0xD8);   // restore memory order
    __lasx_xvst(dst0, dst_rgb, 0);
    dst_rgb += 32;
    src_argb += 64;
  }
}

// ARGB -> ARGB1555 (1-bit A in bit 15, 5 bits each for R,G,B).
// 16 pixels per iteration; shift1/shift2 hold per-byte shift amounts for
// the interleaved G,A and B,R byte pairs respectively.
void ARGBToARGB1555Row_LASX(const uint8_t* src_argb,
                            uint8_t* dst_rgb,
                            int width) {
  int x;
  int len = width / 16;
  __m256i zero = __lasx_xvldi(0);
  __m256i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0;
  __m256i shift1 = {0x0703070307030703, 0x0703070307030703,
                    0x0703070307030703, 0x0703070307030703};
  __m256i shift2 = {0x0200020002000200, 0x0200020002000200,
                    0x0200020002000200, 0x0200020002000200};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);   // B,R
    tmp1 = __lasx_xvpickod_b(src1, src0);   // G,A
    tmp0 = __lasx_xvsrli_b(tmp0, 3);        // 5-bit B and R
    tmp1 = __lasx_xvsrl_b(tmp1, shift1);    // G>>3, A>>7 (alpha -> 1 bit)
    tmp0 = __lasx_xvsll_b(tmp0, shift2);    // B<<0, R<<2 (R -> bits 10..14)
    tmp2 = __lasx_xvpackev_b(zero, tmp1);   // widen G
    tmp3 = __lasx_xvpackod_b(zero, tmp1);   // widen A
    tmp2 = __lasx_xvslli_h(tmp2, 5);        // G -> bits 5..9
    tmp3 = __lasx_xvslli_h(tmp3, 15);       // A -> bit 15
    dst0 = __lasx_xvor_v(tmp0, tmp2);
    dst0 = __lasx_xvor_v(dst0, tmp3);
    dst0 = __lasx_xvpermi_d(dst0, 0xD8);
    __lasx_xvst(dst0, dst_rgb, 0);
    dst_rgb += 32;
    src_argb += 64;
  }
}

// ARGB -> ARGB4444: keep the high nibble of G/A, the high nibble of B/R is
// shifted into the low nibble.  16 pixels per iteration.
void ARGBToARGB4444Row_LASX(const uint8_t* src_argb,
                            uint8_t* dst_rgb,
                            int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1, tmp0, tmp1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);  // B,R
    tmp1 = __lasx_xvpickod_b(src1, src0);  // G,A
    tmp1 = __lasx_xvandi_b(tmp1, 0xF0);    // top nibble of G/A
    tmp0 = __lasx_xvsrli_b(tmp0, 4);       // top nibble of B/R -> low nibble
    dst0 = __lasx_xvor_v(tmp1, tmp0);
    dst0 = __lasx_xvpermi_d(dst0, 0xD8);
    __lasx_xvst(dst0, dst_rgb, 0);
    dst_rgb += 32;
    src_argb += 64;
  }
}

// Full-resolution (4:4:4) ARGB -> U,V conversion, one chroma sample per
// pixel, full-strength coefficients:
//   U = (0x8080 + 112*B - 74*G - 38*R) >> 8
//   V = (0x8080 + 112*R - 94*G - 18*B) >> 8
// 32 pixels per iteration.
void ARGBToUV444Row_LASX(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int32_t width) {
  int x;
  int len = width / 32;
  __m256i src0, src1, src2, src3;
  __m256i tmp0, tmp1, tmp2, tmp3;
  __m256i reg0, reg1, reg2, reg3, dst0, dst1;
  __m256i const_112 = __lasx_xvldi(112);
  __m256i const_74 = __lasx_xvldi(74);
  __m256i const_38 = __lasx_xvldi(38);
  __m256i const_94 = __lasx_xvldi(94);
  __m256i const_18 = __lasx_xvldi(18);
  __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080,
                          0x8080808080808080, 0x8080808080808080};
  __m256i control = {0x0000000400000000, 0x0000000500000001,
                     0x0000000600000002, 0x0000000700000003};
  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64,
              src_argb, 96, src0, src1, src2, src3);
    // Halfword picks: tmp0/tmp2 hold the B,G byte pairs, tmp1/tmp3 the R,A.
    tmp0 = __lasx_xvpickev_h(src1, src0);
    tmp1 = __lasx_xvpickod_h(src1, src0);
    tmp2 = __lasx_xvpickev_h(src3, src2);
    tmp3 = __lasx_xvpickod_h(src3, src2);
    // U channel.
    reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp0, const_112);  // +112*B
    reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp2, const_112);
    reg2 = __lasx_xvmulwod_h_bu(tmp0, const_74);                  // 74*G
    reg3 = __lasx_xvmulwod_h_bu(tmp2, const_74);
    reg2 = __lasx_xvmaddwev_h_bu(reg2, tmp1, const_38);           // +38*R
    reg3 = __lasx_xvmaddwev_h_bu(reg3, tmp3, const_38);
    reg0 = __lasx_xvsub_h(reg0, reg2);
    reg1 = __lasx_xvsub_h(reg1, reg3);
    dst0 = __lasx_xvssrani_b_h(reg1, reg0, 8);
    dst0 = __lasx_xvperm_w(dst0, control);
    // V channel.
    reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp1, const_112);  // +112*R
    reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp3, const_112);
    reg2 = __lasx_xvmulwev_h_bu(tmp0, const_18);                  // 18*B
    reg3 = __lasx_xvmulwev_h_bu(tmp2, const_18);
    reg2 = __lasx_xvmaddwod_h_bu(reg2, tmp0, const_94);           // +94*G
    reg3 = __lasx_xvmaddwod_h_bu(reg3, tmp2, const_94);
    reg0 = __lasx_xvsub_h(reg0, reg2);
    reg1 = __lasx_xvsub_h(reg1, reg3);
    dst1 = __lasx_xvssrani_b_h(reg1, reg0, 8);
    dst1 = __lasx_xvperm_w(dst1, control);
    __lasx_xvst(dst0, dst_u, 0);
    __lasx_xvst(dst1, dst_v, 0);
    dst_u += 32;
    dst_v += 32;
    src_argb += 128;
  }
}

// Per-channel multiply of two ARGB rows: dst = (a * b) >> 8, computed by
// widening a to a*257-style duplicated halfwords and taking the high half
// of the 16x16 unsigned product (xvmuh_hu).  8 pixels per iteration.
void ARGBMultiplyRow_LASX(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  int x;
  int len = width / 8;
  __m256i zero = __lasx_xvldi(0);
  __m256i src0, src1, dst0, dst1;
  __m256i tmp0, tmp1, tmp2, tmp3;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
    tmp0 = __lasx_xvilvl_b(src0, src0);  // duplicate src0 bytes -> halfwords
    tmp1 = __lasx_xvilvh_b(src0, src0);
    tmp2 = __lasx_xvilvl_b(zero, src1);  // zero-extend src1 bytes
    tmp3 = __lasx_xvilvh_b(zero, src1);
    dst0 = __lasx_xvmuh_hu(tmp0, tmp2);
    dst1 = __lasx_xvmuh_hu(tmp1, tmp3);
    dst0 = __lasx_xvpickev_b(dst1, dst0);
    __lasx_xvst(dst0, dst_argb, 0);
    src_argb0 += 32;
    src_argb1 += 32;
    dst_argb += 32;
  }
}

// Saturating per-byte add of two ARGB rows.  8 pixels per iteration.
void ARGBAddRow_LASX(const uint8_t* src_argb0,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  int x;
  int len = width / 8;
  __m256i src0, src1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
    dst0 = __lasx_xvsadd_bu(src0, src1);
    __lasx_xvst(dst0, dst_argb, 0);
    src_argb0 += 32;
    src_argb1 += 32;
    dst_argb += 32;
  }
}

// Saturating per-byte subtract (src0 - src1) of two ARGB rows.
// 8 pixels per iteration.
void ARGBSubtractRow_LASX(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  int x;
  int len = width / 8;
  __m256i src0, src1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
    dst0 = __lasx_xvssub_bu(src0, src1);
    __lasx_xvst(dst0, dst_argb, 0);
    src_argb0 += 32;
    src_argb1 += 32;
    dst_argb += 32;
  }
}

// Premultiplies B, G and R by alpha ((c * a) >> 8 via a 32-bit product
// narrowed with >>24 on byte-duplicated operands); the alpha channel itself
// is passed through unchanged.  16 pixels per iteration.
void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1, tmp0, tmp1;
  __m256i reg0, reg1, reg2, reg3, reg4, reg5;
  __m256i b, g, r, a, dst0, dst1;
  // Re-interleaves the even/odd halves produced by ssrani back into order.
  __m256i control = {0x0005000100040000, 0x0007000300060002,
                     0x0005000100040000, 0x0007000300060002};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);
    tmp1 = __lasx_xvpickod_b(src1, src0);
    b = __lasx_xvpackev_b(tmp0, tmp0);  // each channel byte duplicated
    r = __lasx_xvpackod_b(tmp0, tmp0);
    g = __lasx_xvpackev_b(tmp1, tmp1);
    a = __lasx_xvpackod_b(tmp1, tmp1);
    reg0 = __lasx_xvmulwev_w_hu(b, a);
    reg1 = __lasx_xvmulwod_w_hu(b, a);
    reg2 = __lasx_xvmulwev_w_hu(r, a);
    reg3 = __lasx_xvmulwod_w_hu(r, a);
    reg4 = __lasx_xvmulwev_w_hu(g, a);
    reg5 = __lasx_xvmulwod_w_hu(g, a);
    reg0 = __lasx_xvssrani_h_w(reg1, reg0, 24);
    reg2 = __lasx_xvssrani_h_w(reg3, reg2, 24);
    reg4 = __lasx_xvssrani_h_w(reg5, reg4, 24);
    reg0 = __lasx_xvshuf_h(control, reg0, reg0);
    reg2 = __lasx_xvshuf_h(control, reg2, reg2);
    reg4 = __lasx_xvshuf_h(control, reg4, reg4);
    tmp0 = __lasx_xvpackev_b(reg4, reg0);  // B,G pairs
    tmp1 = __lasx_xvpackev_b(a, reg2);     // R + original A
    dst0 = __lasx_xvilvl_h(tmp1, tmp0);
    dst1 = __lasx_xvilvh_h(tmp1, tmp0);
    __lasx_xvst(dst0, dst_argb, 0);
    __lasx_xvst(dst1, dst_argb, 32);
    dst_argb += 64;
    src_argb += 64;
  }
}

// ARGB -> RGB565 with ordered dithering: the 4-byte dither pattern is
// replicated, widened, added to each channel, clipped to 255, then packed
// as in ARGBToRGB565Row.  16 pixels per iteration.
void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
                                const uint32_t dither4,
                                int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1, tmp0, tmp1, dst0;
  __m256i b, g, r;
  __m256i zero = __lasx_xvldi(0);
  __m256i vec_dither = __lasx_xvldrepl_w(&dither4, 0);

  vec_dither = __lasx_xvilvl_b(zero, vec_dither);  // widen dither to halfwords
  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);  // B,R
    tmp1 = __lasx_xvpickod_b(src1, src0);  // G,A
    b = __lasx_xvpackev_b(zero, tmp0);
    r = __lasx_xvpackod_b(zero, tmp0);
    g = __lasx_xvpackev_b(zero, tmp1);
    b = __lasx_xvadd_h(b, vec_dither);
    g = __lasx_xvadd_h(g, vec_dither);
    r = __lasx_xvadd_h(r, vec_dither);
    DUP2_ARG1(__lasx_xvclip255_h, b, g, b, g);  // saturate after dither add
    r = __lasx_xvclip255_h(r);
    b = __lasx_xvsrai_h(b, 3);
    g = __lasx_xvsrai_h(g, 2);
    r = __lasx_xvsrai_h(r, 3);
    g = __lasx_xvslli_h(g, 5);
    r = __lasx_xvslli_h(r, 11);
    dst0 = __lasx_xvor_v(b, g);
    dst0 = __lasx_xvor_v(dst0, r);
    dst0 = __lasx_xvpermi_d(dst0, 0xD8);
    __lasx_xvst(dst0, dst_rgb, 0);
    src_argb += 64;
    dst_rgb += 32;
  }
}

// Reorders the 4 channels of each pixel.  Only the first 4 bytes of
// `shuffler` are read (xvldrepl_w) and broadcast as per-pixel offsets on a
// base index pattern.  NOTE(review): assumes the caller's 4-byte shuffle
// pattern repeats per pixel — confirm against ARGBShuffle callers.
// 16 pixels per iteration.
void ARGBShuffleRow_LASX(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1, dst0, dst1;
  __m256i shuf = {0x0404040400000000, 0x0C0C0C0C08080808,
                  0x0404040400000000, 0x0C0C0C0C08080808};
  __m256i temp = __lasx_xvldrepl_w(shuffler, 0);

  shuf = __lasx_xvadd_b(shuf, temp);  // per-pixel base + channel offsets
  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    dst0 = __lasx_xvshuf_b(src0, src0, shuf);
    dst1 = __lasx_xvshuf_b(src1, src1, shuf);
    __lasx_xvst(dst0, dst_argb, 0);
    __lasx_xvst(dst1, dst_argb, 32);
    src_argb += 64;
    dst_argb += 64;
  }
}

// Scales every channel by the packed per-channel factors in `value`:
// bytes are duplicated into halfwords and the high half of the 16x16
// unsigned product is taken (xvpickod_b of xvmuh_hu results).
// 8 pixels per iteration.
void ARGBShadeRow_LASX(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  int x;
  int len = width / 8;
  __m256i src0, dst0, tmp0, tmp1;
  __m256i vec_value = __lasx_xvreplgr2vr_w(value);

  vec_value = __lasx_xvilvl_b(vec_value, vec_value);  // duplicate value bytes
  for (x = 0; x < len; x++) {
    src0 = __lasx_xvld(src_argb, 0);
    tmp0 = __lasx_xvilvl_b(src0, src0);
    tmp1 = __lasx_xvilvh_b(src0, src0);
    tmp0 = __lasx_xvmuh_hu(tmp0, vec_value);
    tmp1 = __lasx_xvmuh_hu(tmp1, vec_value);
    dst0 = __lasx_xvpickod_b(tmp1, tmp0);
    __lasx_xvst(dst0, dst_argb, 0);
    src_argb += 32;
    dst_argb += 32;
  }
}

// Converts ARGB to grayscale in ARGB layout: from the constants below,
// gray = (29*B + 150*G + 77*R + 128) >> 8 (0x1D=29, 0x96=150, 0x4D=77);
// the >>8 is done by taking the odd (high) bytes of the halfword sums.
// Alpha is preserved.  16 pixels per iteration.
void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1, tmp0, tmp1;
  __m256i reg0, reg1, reg2, dst0, dst1;
  // NOTE(review): 0x480 is a vldi mode immediate — presumably halfword 128
  // (the rounding bias); confirm against the LASX vldi encoding table.
  __m256i const_128 = __lasx_xvldi(0x480);
  __m256i const_150 = __lasx_xvldi(0x96);
  __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D,
                      0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);          // B,R
    tmp1 = __lasx_xvpickod_b(src1, src0);          // G,A
    reg0 = __lasx_xvdp2_h_bu(tmp0, const_br);      // 29*B + 77*R (pair dot)
    reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150);  // 128 + 150*G
    reg2 = __lasx_xvadd_h(reg0, reg1);
    tmp0 = __lasx_xvpackod_b(reg2, reg2);  // gray duplicated (high bytes)
    tmp1 = __lasx_xvpackod_b(tmp1, reg2);  // gray paired with original A
    dst0 = __lasx_xvilvl_h(tmp1, tmp0);
    dst1 = __lasx_xvilvh_h(tmp1, tmp0);
    __lasx_xvst(dst0, dst_argb, 0);
    __lasx_xvst(dst1, dst_argb, 32);
    src_argb += 64;
    dst_argb += 64;
  }
}

// In-place sepia filter.  From the pair-dot constants (byte pairs apply to
// the interleaved B,R bytes) and the even-lane G multiplies:
//   b = (17*B + 68*G + 35*R) >> 7
//   g = (22*B + 88*G + 45*R) >> 7
//   r = (24*B + 98*G + 50*R) >> 7
// g and r are additionally saturated to 7 bits after the shift; alpha is
// preserved via the `shuff` gather from tmp1.  16 pixels per iteration.
void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1, tmp0, tmp1;
  __m256i reg0, reg1, spb, spg, spr;
  __m256i dst0, dst1;
  __m256i spb_g = __lasx_xvldi(68);
  __m256i spg_g = __lasx_xvldi(88);
  __m256i spr_g = __lasx_xvldi(98);
  __m256i spb_br = {0x2311231123112311, 0x2311231123112311,
                    0x2311231123112311, 0x2311231123112311};  // 17,35
  __m256i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16,
                    0x2D162D162D162D16, 0x2D162D162D162D16};  // 22,45
  __m256i spr_br = {0x3218321832183218, 0x3218321832183218,
                    0x3218321832183218, 0x3218321832183218};  // 24,50
  __m256i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908,
                   0x1706150413021100, 0x1F0E1D0C1B0A1908};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, dst_argb, 0, dst_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);  // B,R
    tmp1 = __lasx_xvpickod_b(src1, src0);  // G,A
    DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg);
    spr = __lasx_xvdp2_h_bu(tmp0, spr_br);
    spb = __lasx_xvmaddwev_h_bu(spb, tmp1, spb_g);
    spg = __lasx_xvmaddwev_h_bu(spg, tmp1, spg_g);
    spr = __lasx_xvmaddwev_h_bu(spr, tmp1, spr_g);
    spb = __lasx_xvsrli_h(spb, 7);
    spg = __lasx_xvsrli_h(spg, 7);
    spr = __lasx_xvsrli_h(spr, 7);
    spg = __lasx_xvsat_hu(spg, 7);  // clamp g and r to <= 255... (7-bit sat
    spr = __lasx_xvsat_hu(spr, 7);  // on the shifted halfword value)
    reg0 = __lasx_xvpackev_b(spg, spb);
    reg1 = __lasx_xvshuf_b(tmp1, spr, shuff);  // merge r with original alpha
    dst0 = __lasx_xvilvl_h(reg1, reg0);
    dst1 = __lasx_xvilvh_h(reg1, reg0);
    __lasx_xvst(dst0, dst_argb, 0);
    __lasx_xvst(dst1, dst_argb, 32);
    dst_argb += 64;
  }
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv