aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHao Chen <chenhao@loongson.cn>2021-12-20 19:57:26 +0800
committerFrank Barchard <fbarchard@chromium.org>2022-01-21 01:34:38 +0000
commitde8ae8c679f5a42fb9f9f65318d6cb95112180d6 (patch)
tree4f504ae4587084990aa39a10f820591f40ff30ed
parent51de1e16f20bb93468d7c538629b40ece8420b71 (diff)
downloadlibyuv-de8ae8c679f5a42fb9f9f65318d6cb95112180d6.tar.gz
Add optimization functions in row_lasx.cc file.
Optimize 32 functions in source/row_lasx.cc file. All test cases passed on loongarch platform.

Bug: libyuv:912
Signed-off-by: Hao Chen <chenhao@loongson.cn>
Change-Id: I7d3f649f753f72ca9bd052d5e0562dbc6f6ccfed
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3351466
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
-rw-r--r--include/libyuv/row.h243
-rw-r--r--source/convert.cc40
-rw-r--r--source/convert_argb.cc48
-rw-r--r--source/convert_from.cc24
-rw-r--r--source/convert_from_argb.cc139
-rw-r--r--source/planar_functions.cc104
-rw-r--r--source/rotate_argb.cc8
-rw-r--r--source/row_any.cc72
-rw-r--r--source/row_lasx.cc1042
9 files changed, 1720 insertions, 0 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 37825a39..957eb587 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -689,6 +689,38 @@ extern "C" {
#define HAS_I422TOARGBROW_LASX
#define HAS_I422TORGBAROW_LASX
#define HAS_I422ALPHATOARGBROW_LASX
+#define HAS_I422TOYUY2ROW_LASX
+#define HAS_I422TOUYVYROW_LASX
+#define HAS_MIRRORROW_LASX
+#define HAS_MIRRORUVROW_LASX
+#define HAS_ARGBMIRRORROW_LASX
+#define HAS_I422TORGB24ROW_LASX
+#define HAS_I422TORGB565ROW_LASX
+#define HAS_I422TOARGB4444ROW_LASX
+#define HAS_I422TOARGB1555ROW_LASX
+#define HAS_YUY2TOUVROW_LASX
+#define HAS_YUY2TOYROW_LASX
+#define HAS_YUY2TOUV422ROW_LASX
+#define HAS_UYVYTOYROW_LASX
+#define HAS_UYVYTOUVROW_LASX
+#define HAS_UYVYTOUV422ROW_LASX
+#define HAS_ARGBTOYROW_LASX
+#define HAS_ARGBTOUVROW_LASX
+#define HAS_ARGBTORGB24ROW_LASX
+#define HAS_ARGBTORAWROW_LASX
+#define HAS_ARGBTORGB565ROW_LASX
+#define HAS_ARGBTOARGB1555ROW_LASX
+#define HAS_ARGBTOARGB4444ROW_LASX
+#define HAS_ARGBTOUV444ROW_LASX
+#define HAS_ARGBMULTIPLYROW_LASX
+#define HAS_ARGBADDROW_LASX
+#define HAS_ARGBSUBTRACTROW_LASX
+#define HAS_ARGBATTENUATEROW_LASX
+#define HAS_ARGBTORGB565DITHERROW_LASX
+#define HAS_ARGBSHUFFLEROW_LASX
+#define HAS_ARGBSHADEROW_LASX
+#define HAS_ARGBGRAYROW_LASX
+#define HAS_ARGBSEPIAROW_LASX
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -1005,24 +1037,48 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGB24Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGB565Row_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGB565Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGB4444Row_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB4444Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGB1555Row_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB1555Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
void NV12ToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_argb,
@@ -1074,6 +1130,7 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1092,6 +1149,15 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUVRow_LASX(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBToUV444Row_MMI(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1341,6 +1407,8 @@ void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGBToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@@ -1439,6 +1507,15 @@ void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1678,6 +1755,7 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -1685,15 +1763,18 @@ void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
uint8_t* dst_u,
@@ -1721,6 +1802,7 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -1733,6 +1815,7 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
int width);
void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_rgb24,
@@ -2537,6 +2620,10 @@ void ARGBShuffleRow_MMI(const uint8_t* src_argb,
uint8_t* dst_argb,
const uint8_t* shuffler,
int width);
+void ARGBShuffleRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width);
void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint8_t* param,
@@ -2557,6 +2644,10 @@ void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint8_t* param,
int width);
+void ARGBShuffleRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
+ int width);
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_argb,
@@ -2777,6 +2868,20 @@ void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
uint8_t* dst_rgb,
const uint32_t dither4,
int width);
+void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width);
+
+void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -3896,6 +4001,14 @@ void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBMultiplyRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
// ARGB add images.
void ARGBAddRow_C(const uint8_t* src_argb,
@@ -3942,6 +4055,14 @@ void ARGBAddRow_Any_MMI(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBAddRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
// ARGB subtract images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
@@ -3989,6 +4110,14 @@ void ARGBSubtractRow_Any_MMI(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBSubtractRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -4077,6 +4206,24 @@ void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint32_t param,
int width);
+void ARGBToRGB565DitherRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
+ int width);
+
+void ARGBToRGB24Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -4291,24 +4438,48 @@ void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGB24Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGB565Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB4444Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB1555Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -4365,11 +4536,17 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
int width);
void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
int src_stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUVRow_LASX(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
int src_stride_yuy2,
uint8_t* dst_u,
@@ -4379,6 +4556,10 @@ void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -4425,11 +4606,17 @@ void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
int width);
void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
@@ -4439,6 +4626,10 @@ void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUV422Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -4485,11 +4676,17 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
int width);
void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
int src_stride_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUVRow_LASX(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
int src_stride_uyvy,
uint8_t* dst_u,
@@ -4499,6 +4696,10 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUV422Row_LASX(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -4546,11 +4747,17 @@ void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr,
int width);
void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
@@ -4560,6 +4767,10 @@ void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUV422Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -4679,6 +4890,11 @@ void I422ToYUY2Row_MMI(const uint8_t* src_y,
const uint8_t* src_v,
uint8_t* dst_yuy2,
int width);
+void I422ToYUY2Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
void I422ToUYVYRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -4689,6 +4905,11 @@ void I422ToUYVYRow_MMI(const uint8_t* src_y,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width);
+void I422ToUYVYRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -4699,6 +4920,11 @@ void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
+void I422ToYUY2Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -4709,6 +4935,11 @@ void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
+void I422ToUYVYRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
// Effects related row functions.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
@@ -4727,6 +4958,9 @@ void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
uint8_t* dst_argb,
int width);
+void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@@ -4742,6 +4976,9 @@ void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr,
void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGBAttenuateRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
// Inverse table for unattenuate, shared by C and SSE2.
extern const uint32_t fixed_invtbl8[256];
@@ -4766,12 +5003,14 @@ void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width);
void ARGBColorMatrixRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
@@ -4849,6 +5088,10 @@ void ARGBShadeRow_MMI(const uint8_t* src_argb,
uint8_t* dst_argb,
int width,
uint32_t value);
+void ARGBShadeRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value);
// Used for blur.
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
diff --git a/source/convert.cc b/source/convert.cc
index c070bf81..1e524de3 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -1095,6 +1095,16 @@ int YUY2ToI420(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LASX;
+ YUY2ToUVRow = YUY2ToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_LASX;
+ YUY2ToUVRow = YUY2ToUVRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
@@ -1186,6 +1196,16 @@ int UYVYToI420(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_UYVYTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ UYVYToYRow = UYVYToYRow_Any_LASX;
+ UYVYToUVRow = UYVYToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_LASX;
+ UYVYToUVRow = UYVYToUVRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
@@ -1440,6 +1460,16 @@ int ARGBToI420(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -2924,6 +2954,16 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
#endif
{
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 21367c9b..7128e9f9 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -4668,6 +4668,14 @@ int I420ToRGB24Matrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TORGB24ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
@@ -4856,6 +4864,14 @@ int I420ToARGB1555(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOARGB1555ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
@@ -4937,6 +4953,14 @@ int I420ToARGB4444(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOARGB4444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
@@ -5018,6 +5042,14 @@ int I420ToRGB565Matrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB565Row = I422ToRGB565Row_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
@@ -5140,6 +5172,14 @@ int I422ToRGB565(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB565Row = I422ToRGB565Row_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
@@ -5285,6 +5325,14 @@ int I420ToRGB565Dither(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX;
+ }
+ }
+#endif
{
// Allocate a row of argb.
align_buffer_64(row_argb, width * 4);
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 62a13d04..41a3c17a 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -452,6 +452,14 @@ int I420ToYUY2(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_LASX;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
@@ -539,6 +547,14 @@ int I422ToUYVY(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -614,6 +630,14 @@ int I420ToUYVY(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 55c9ee61..6d147975 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -84,6 +84,14 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOUV444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUV444Row = ARGBToUV444Row_LASX;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -124,6 +132,14 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToUV444Row(src_argb, dst_u, dst_v, width);
@@ -245,6 +261,17 @@ int ARGBToI422(const uint8_t* src_argb,
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
+
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
ARGBToYRow(src_argb, dst_y, width);
@@ -355,6 +382,16 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -519,6 +556,16 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -1015,6 +1062,16 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -1055,6 +1112,14 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_LASX;
+ }
+ }
+#endif
{
// Allocate a rows of yuv.
@@ -1180,6 +1245,16 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -1220,6 +1295,14 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_LASX;
+ }
+ }
+#endif
{
// Allocate a rows of yuv.
@@ -1305,6 +1388,14 @@ int ARGBToI400(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToYRow(src_argb, dst_y, width);
@@ -1403,6 +1494,14 @@ int ARGBToRGB24(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB24ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB24Row(src_argb, dst_rgb24, width);
@@ -1477,6 +1576,14 @@ int ARGBToRAW(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORAWROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRAWRow = ARGBToRAWRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRAWRow(src_argb, dst_raw, width);
@@ -1555,6 +1662,14 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB565DitherRow(src_argb, dst_rgb565,
@@ -1632,6 +1747,14 @@ int ARGBToRGB565(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB565Row(src_argb, dst_rgb565, width);
@@ -1706,6 +1829,14 @@ int ARGBToARGB1555(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOARGB1555ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToARGB1555Row(src_argb, dst_argb1555, width);
@@ -1780,6 +1911,14 @@ int ARGBToARGB4444(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOARGB4444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToARGB4444Row(src_argb, dst_argb4444, width);
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 7cea06c8..af555338 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1728,6 +1728,16 @@ int YUY2ToI422(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LASX;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_LASX;
+ YUY2ToUV422Row = YUY2ToUV422Row_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
@@ -1824,6 +1834,16 @@ int UYVYToI422(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_UYVYTOYROW_LASX) && defined(HAS_UYVYTOUV422ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ UYVYToYRow = UYVYToYRow_Any_LASX;
+ UYVYToUV422Row = UYVYToUV422Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_LASX;
+ UYVYToUV422Row = UYVYToUV422Row_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
@@ -1968,6 +1988,14 @@ void MirrorPlane(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_MIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ MirrorRow = MirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_LASX;
+ }
+ }
+#endif
// Mirror plane
for (y = 0; y < height; ++y) {
@@ -2026,6 +2054,14 @@ void MirrorUVPlane(const uint8_t* src_uv,
}
}
#endif
+#if defined(HAS_MIRRORUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ MirrorUVRow = MirrorUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorUVRow = MirrorUVRow_LASX;
+ }
+ }
+#endif
// MirrorUV plane
for (y = 0; y < height; ++y) {
@@ -2194,6 +2230,14 @@ int ARGBMirror(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBMIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_LASX;
+ }
+ }
+#endif
// Mirror plane
for (y = 0; y < height; ++y) {
@@ -2602,6 +2646,14 @@ int ARGBMultiply(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBMULTIPLYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_LASX;
+ }
+ }
+#endif
// Multiply plane
for (y = 0; y < height; ++y) {
@@ -2687,6 +2739,14 @@ int ARGBAdd(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBADDROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAddRow = ARGBAddRow_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_LASX;
+ }
+ }
+#endif
// Add plane
for (y = 0; y < height; ++y) {
@@ -2767,6 +2827,14 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBSUBTRACTROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_LASX;
+ }
+ }
+#endif
// Subtract plane
for (y = 0; y < height; ++y) {
@@ -3073,6 +3141,14 @@ int ARGBAttenuate(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBAttenuateRow(src_argb, dst_argb, width);
@@ -3178,6 +3254,11 @@ int ARGBGrayTo(const uint8_t* src_argb,
ARGBGrayRow = ARGBGrayRow_MSA;
}
#endif
+#if defined(HAS_ARGBGRAYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) {
+ ARGBGrayRow = ARGBGrayRow_LASX;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBGrayRow(src_argb, dst_argb, width);
@@ -3228,6 +3309,11 @@ int ARGBGray(uint8_t* dst_argb,
ARGBGrayRow = ARGBGrayRow_MSA;
}
#endif
+#if defined(HAS_ARGBGRAYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) {
+ ARGBGrayRow = ARGBGrayRow_LASX;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBGrayRow(dst, dst, width);
@@ -3276,6 +3362,11 @@ int ARGBSepia(uint8_t* dst_argb,
ARGBSepiaRow = ARGBSepiaRow_MSA;
}
#endif
+#if defined(HAS_ARGBSEPIAROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) {
+ ARGBSepiaRow = ARGBSepiaRow_LASX;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBSepiaRow(dst, width);
@@ -3706,6 +3797,11 @@ int ARGBShade(const uint8_t* src_argb,
ARGBShadeRow = ARGBShadeRow_MSA;
}
#endif
+#if defined(HAS_ARGBSHADEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 8)) {
+ ARGBShadeRow = ARGBShadeRow_LASX;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBShadeRow(src_argb, dst_argb, width, value);
@@ -3916,6 +4012,14 @@ int ARGBShuffle(const uint8_t* src_bgra,
}
}
#endif
+#if defined(HAS_ARGBSHUFFLEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBShuffleRow = ARGBShuffleRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index ae653886..4d36a910 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -163,6 +163,14 @@ static int ARGBRotate180(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBMIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_LASX;
+ }
+ }
+#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
diff --git a/source/row_any.cc b/source/row_any.cc
index 7d24b15c..b1b5f8a9 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -297,6 +297,9 @@ ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
#ifdef HAS_I422TOYUY2ROW_MMI
ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7)
#endif
+#ifdef HAS_I422TOYUY2ROW_LASX
+ANY31(I422ToYUY2Row_Any_LASX, I422ToYUY2Row_LASX, 1, 1, 4, 31)
+#endif
#ifdef HAS_I422TOUYVYROW_NEON
ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
#endif
@@ -306,6 +309,9 @@ ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
#ifdef HAS_I422TOUYVYROW_MMI
ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7)
#endif
+#ifdef HAS_I422TOUYVYROW_LASX
+ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31)
+#endif
#ifdef HAS_BLENDPLANEROW_AVX2
ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
#endif
@@ -425,6 +431,10 @@ ANY31C(I422ToRGBARow_Any_MMI, I422ToRGBARow_MMI, 1, 0, 4, 7)
#ifdef HAS_I422TOARGBROW_LASX
ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31)
ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31)
+ANY31C(I422ToRGB24Row_Any_LASX, I422ToRGB24Row_LASX, 1, 0, 3, 31)
+ANY31C(I422ToRGB565Row_Any_LASX, I422ToRGB565Row_LASX, 1, 0, 2, 31)
+ANY31C(I422ToARGB4444Row_Any_LASX, I422ToARGB4444Row_LASX, 1, 0, 2, 31)
+ANY31C(I422ToARGB1555Row_Any_LASX, I422ToARGB1555Row_LASX, 1, 0, 2, 31)
#endif
#undef ANY31C
@@ -631,18 +641,27 @@ ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
#ifdef HAS_ARGBMULTIPLYROW_MMI
ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1)
#endif
+#ifdef HAS_ARGBMULTIPLYROW_LASX
+ANY21(ARGBMultiplyRow_Any_LASX, ARGBMultiplyRow_LASX, 0, 4, 4, 4, 7)
+#endif
#ifdef HAS_ARGBADDROW_MSA
ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBADDROW_MMI
ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1)
#endif
+#ifdef HAS_ARGBADDROW_LASX
+ANY21(ARGBAddRow_Any_LASX, ARGBAddRow_LASX, 0, 4, 4, 4, 7)
+#endif
#ifdef HAS_ARGBSUBTRACTROW_MSA
ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBSUBTRACTROW_MMI
ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1)
#endif
+#ifdef HAS_ARGBSUBTRACTROW_LASX
+ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7)
+#endif
#ifdef HAS_SOBELROW_SSE2
ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
#endif
@@ -953,6 +972,13 @@ ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3)
ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3)
ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3)
#endif
+#if defined(HAS_ARGBTORGB24ROW_LASX)
+ANY11(ARGBToRGB24Row_Any_LASX, ARGBToRGB24Row_LASX, 0, 4, 3, 31)
+ANY11(ARGBToRAWRow_Any_LASX, ARGBToRAWRow_LASX, 0, 4, 3, 31)
+ANY11(ARGBToRGB565Row_Any_LASX, ARGBToRGB565Row_LASX, 0, 4, 2, 15)
+ANY11(ARGBToARGB1555Row_Any_LASX, ARGBToARGB1555Row_LASX, 0, 4, 2, 15)
+ANY11(ARGBToARGB4444Row_Any_LASX, ARGBToARGB4444Row_LASX, 0, 4, 2, 15)
+#endif
#if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
#endif
@@ -1007,6 +1033,9 @@ ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
#ifdef HAS_ARGBTOYROW_MMI
ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7)
#endif
+#ifdef HAS_ARGBTOYROW_LASX
+ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31)
+#endif
#ifdef HAS_ARGBTOYJROW_NEON
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
#endif
@@ -1115,12 +1144,18 @@ ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
#ifdef HAS_YUY2TOYROW_MSA
ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
#endif
+#ifdef HAS_YUY2TOYROW_LASX
+ANY11(YUY2ToYRow_Any_LASX, YUY2ToYRow_LASX, 1, 4, 1, 31)
+#endif
#ifdef HAS_YUY2TOYROW_MMI
ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7)
#endif
#ifdef HAS_UYVYTOYROW_MSA
ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
#endif
+#ifdef HAS_UYVYTOYROW_LASX
+ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31)
+#endif
#ifdef HAS_UYVYTOYROW_MMI
ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
#endif
@@ -1205,6 +1240,9 @@ ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
#ifdef HAS_ARGBATTENUATEROW_MMI
ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1)
#endif
+#ifdef HAS_ARGBATTENUATEROW_LASX
+ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15)
+#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
#endif
@@ -1354,6 +1392,14 @@ ANY11P(ARGBToRGB565DitherRow_Any_MMI,
2,
3)
#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ANY11P(ARGBToRGB565DitherRow_Any_LASX,
+ ARGBToRGB565DitherRow_LASX,
+ const uint32_t,
+ 4,
+ 2,
+ 15)
+#endif
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
#endif
@@ -1369,6 +1415,9 @@ ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
#ifdef HAS_ARGBSHUFFLEROW_MMI
ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1)
#endif
+#ifdef HAS_ARGBSHUFFLEROW_LASX
+ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15)
+#endif
#undef ANY11P
#undef ANY11P
@@ -1667,6 +1716,9 @@ ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
#ifdef HAS_MIRRORROW_MMI
ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
#endif
+#ifdef HAS_MIRRORROW_LASX
+ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63)
+#endif
#ifdef HAS_MIRRORUVROW_AVX2
ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
#endif
@@ -1679,6 +1731,9 @@ ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
#ifdef HAS_MIRRORUVROW_MSA
ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7)
#endif
+#ifdef HAS_MIRRORUVROW_LASX
+ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15)
+#endif
#ifdef HAS_ARGBMIRRORROW_AVX2
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
#endif
@@ -1691,6 +1746,9 @@ ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
#ifdef HAS_ARGBMIRRORROW_MSA
ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
#endif
+#ifdef HAS_ARGBMIRRORROW_LASX
+ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15)
+#endif
#ifdef HAS_ARGBMIRRORROW_MMI
ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1)
#endif
@@ -1791,6 +1849,11 @@ ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7)
ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15)
ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15)
#endif
+#ifdef HAS_YUY2TOUV422ROW_LASX
+ANY12(ARGBToUV444Row_Any_LASX, ARGBToUV444Row_LASX, 0, 4, 0, 31)
+ANY12(YUY2ToUV422Row_Any_LASX, YUY2ToUV422Row_LASX, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_LASX, UYVYToUV422Row_LASX, 1, 4, 1, 31)
+#endif
#undef ANY12
// Any 2 16 bit planes with parameter to 1
@@ -1951,6 +2014,9 @@ ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
#ifdef HAS_ARGBTOUVROW_MMI
ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15)
#endif
+#ifdef HAS_ARGBTOUVROW_LASX
+ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31)
+#endif
#ifdef HAS_ARGBTOUVJROW_NEON
ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
#endif
@@ -2047,9 +2113,15 @@ ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
#ifdef HAS_YUY2TOUVROW_MMI
ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15)
#endif
+#ifdef HAS_YUY2TOUVROW_LASX
+ANY12S(YUY2ToUVRow_Any_LASX, YUY2ToUVRow_LASX, 1, 4, 31)
+#endif
#ifdef HAS_UYVYTOUVROW_MSA
ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
#endif
+#ifdef HAS_UYVYTOUVROW_LASX
+ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31)
+#endif
#ifdef HAS_UYVYTOUVROW_MMI
ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
#endif
diff --git a/source/row_lasx.cc b/source/row_lasx.cc
index 0e7b38a1..b9c7cc16 100644
--- a/source/row_lasx.cc
+++ b/source/row_lasx.cc
@@ -197,6 +197,125 @@ extern "C" {
pdst_argb += 64; \
}
+void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
+ int x;
+ int len = width / 64;
+ __m256i src0, src1;
+ __m256i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607,
+ 0x08090A0B0C0D0E0F, 0x0001020304050607};
+ src += width - 64;
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1);
+ DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler,
+ src1, src1, shuffler, src0, src1);
+ src0 = __lasx_xvpermi_q(src0, src0, 0x01);
+ src1 = __lasx_xvpermi_q(src1, src1, 0x01);
+ __lasx_xvst(src1, dst, 0);
+ __lasx_xvst(src0, dst, 32);
+ dst += 64;
+ src -= 64;
+ }
+}
+
+void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ int len = width / 16;
+ __m256i src, dst;
+ __m256i shuffler = {0x0004000500060007, 0x0000000100020003,
+ 0x0004000500060007, 0x0000000100020003};
+
+ src_uv += (width - 16) << 1;
+ for (x = 0; x < len; x++) {
+ src = __lasx_xvld(src_uv, 0);
+ dst = __lasx_xvshuf_h(shuffler, src, src);
+ dst = __lasx_xvpermi_q(dst, dst, 0x01);
+ __lasx_xvst(dst, dst_uv, 0);
+ src_uv -= 32;
+ dst_uv += 32;
+ }
+}
+
+void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1;
+ __m256i dst0, dst1;
+ __m256i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504,
+ 0x0B0A09080F0E0D0C, 0x0302010007060504};
+ src += (width * 4) - 64;
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1);
+ DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler,
+ src1, src1, shuffler, src0, src1);
+ dst1 = __lasx_xvpermi_q(src0, src0, 0x01);
+ dst0 = __lasx_xvpermi_q(src1, src1, 0x01);
+ __lasx_xvst(dst0, dst, 0);
+ __lasx_xvst(dst1, dst, 32);
+ dst += 64;
+ src -= 64;
+ }
+}
+
+void I422ToYUY2Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src_u0, src_v0, src_y0, vec_uv0;
+ __m256i vec_yuy2_0, vec_yuy2_1;
+ __m256i dst_yuy2_0, dst_yuy2_1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0);
+ src_y0 = __lasx_xvld(src_y, 0);
+ src_u0 = __lasx_xvpermi_d(src_u0, 0xD8);
+ src_v0 = __lasx_xvpermi_d(src_v0, 0xD8);
+ vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0);
+ vec_yuy2_0 = __lasx_xvilvl_b(vec_uv0, src_y0);
+ vec_yuy2_1 = __lasx_xvilvh_b(vec_uv0, src_y0);
+ dst_yuy2_0 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x20);
+ dst_yuy2_1 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x31);
+ __lasx_xvst(dst_yuy2_0, dst_yuy2, 0);
+ __lasx_xvst(dst_yuy2_1, dst_yuy2, 32);
+ src_u += 16;
+ src_v += 16;
+ src_y += 32;
+ dst_yuy2 += 64;
+ }
+}
+
+void I422ToUYVYRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src_u0, src_v0, src_y0, vec_uv0;
+ __m256i vec_uyvy0, vec_uyvy1;
+ __m256i dst_uyvy0, dst_uyvy1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0);
+ src_y0 = __lasx_xvld(src_y, 0);
+ src_u0 = __lasx_xvpermi_d(src_u0, 0xD8);
+ src_v0 = __lasx_xvpermi_d(src_v0, 0xD8);
+ vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0);
+ vec_uyvy0 = __lasx_xvilvl_b(src_y0, vec_uv0);
+ vec_uyvy1 = __lasx_xvilvh_b(src_y0, vec_uv0);
+ dst_uyvy0 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x20);
+ dst_uyvy1 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x31);
+ __lasx_xvst(dst_uyvy0, dst_uyvy, 0);
+ __lasx_xvst(dst_uyvy1, dst_uyvy, 32);
+ src_u += 16;
+ src_v += 16;
+ src_y += 32;
+ dst_uyvy +=64;
+ }
+}
+
void I422ToARGBRow_LASX(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -295,6 +414,929 @@ void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
}
}
+void I422ToRGB24Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int32_t width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i shuffler0 = {0x0504120302100100, 0x0A18090816070614,
+ 0x0504120302100100, 0x0A18090816070614};
+ __m256i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B,
+ 0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i temp0, temp1, temp2, temp3;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg,
+ vec_yb, b_l, b_h, g_l, g_h, r_l, r_h);
+ temp0 = __lasx_xvpackev_b(g_l, b_l);
+ temp1 = __lasx_xvpackev_b(g_h, b_h);
+ DUP4_ARG3(__lasx_xvshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1,
+ r_l, temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0, temp1);
+
+ b_l = __lasx_xvilvl_d(temp1, temp2);
+ b_h = __lasx_xvilvh_d(temp3, temp1);
+ temp1 = __lasx_xvpermi_q(b_l, temp0, 0x20);
+ temp2 = __lasx_xvpermi_q(temp0, b_h, 0x30);
+ temp3 = __lasx_xvpermi_q(b_h, b_l, 0x31);
+ __lasx_xvst(temp1, dst_argb, 0);
+ __lasx_xvst(temp2, dst_argb, 32);
+ __lasx_xvst(temp3, dst_argb, 64);
+ dst_argb += 96;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
+void I422ToRGB565Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i dst_l, dst_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg,
+ vec_yb, b_l, b_h, g_l, g_h, r_l, r_h);
+ b_l = __lasx_xvsrli_h(b_l, 3);
+ b_h = __lasx_xvsrli_h(b_h, 3);
+ g_l = __lasx_xvsrli_h(g_l, 2);
+ g_h = __lasx_xvsrli_h(g_h, 2);
+ r_l = __lasx_xvsrli_h(r_l, 3);
+ r_h = __lasx_xvsrli_h(r_h, 3);
+ r_l = __lasx_xvslli_h(r_l, 11);
+ r_h = __lasx_xvslli_h(r_h, 11);
+ g_l = __lasx_xvslli_h(g_l, 5);
+ g_h = __lasx_xvslli_h(g_h, 5);
+ r_l = __lasx_xvor_v(r_l, g_l);
+ r_l = __lasx_xvor_v(r_l, b_l);
+ r_h = __lasx_xvor_v(r_h, g_h);
+ r_h = __lasx_xvor_v(r_h, b_h);
+ dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
+ dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
+ __lasx_xvst(dst_l, dst_rgb565, 0);
+ __lasx_xvst(dst_h, dst_rgb565, 32);
+ dst_rgb565 += 64;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
+void I422ToARGB4444Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i alpha = {0xF000F000F000F000, 0xF000F000F000F000,
+ 0xF000F000F000F000, 0xF000F000F000F000};
+ __m256i mask = {0x00F000F000F000F0, 0x00F000F000F000F0,
+ 0x00F000F000F000F0, 0x00F000F000F000F0};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i dst_l, dst_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg,
+ vec_yb, b_l, b_h, g_l, g_h, r_l, r_h);
+ b_l = __lasx_xvsrli_h(b_l, 4);
+ b_h = __lasx_xvsrli_h(b_h, 4);
+ r_l = __lasx_xvsrli_h(r_l, 4);
+ r_h = __lasx_xvsrli_h(r_h, 4);
+ g_l = __lasx_xvand_v(g_l, mask);
+ g_h = __lasx_xvand_v(g_h, mask);
+ r_l = __lasx_xvslli_h(r_l, 8);
+ r_h = __lasx_xvslli_h(r_h, 8);
+ r_l = __lasx_xvor_v(r_l, alpha);
+ r_h = __lasx_xvor_v(r_h, alpha);
+ r_l = __lasx_xvor_v(r_l, g_l);
+ r_h = __lasx_xvor_v(r_h, g_h);
+ r_l = __lasx_xvor_v(r_l, b_l);
+ r_h = __lasx_xvor_v(r_h, b_h);
+ dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
+ dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
+ __lasx_xvst(dst_l, dst_argb4444, 0);
+ __lasx_xvst(dst_h, dst_argb4444, 32);
+ dst_argb4444 += 64;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+void I422ToARGB1555Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i alpha = {0x8000800080008000, 0x8000800080008000,
+ 0x8000800080008000, 0x8000800080008000};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i dst_l, dst_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg,
+ vec_yb, b_l, b_h, g_l, g_h, r_l, r_h);
+ b_l = __lasx_xvsrli_h(b_l, 3);
+ b_h = __lasx_xvsrli_h(b_h, 3);
+ g_l = __lasx_xvsrli_h(g_l, 3);
+ g_h = __lasx_xvsrli_h(g_h, 3);
+ g_l = __lasx_xvslli_h(g_l, 5);
+ g_h = __lasx_xvslli_h(g_h, 5);
+ r_l = __lasx_xvsrli_h(r_l, 3);
+ r_h = __lasx_xvsrli_h(r_h, 3);
+ r_l = __lasx_xvslli_h(r_l, 10);
+ r_h = __lasx_xvslli_h(r_h, 10);
+ r_l = __lasx_xvor_v(r_l, alpha);
+ r_h = __lasx_xvor_v(r_h, alpha);
+ r_l = __lasx_xvor_v(r_l, g_l);
+ r_h = __lasx_xvor_v(r_h, g_h);
+ r_l = __lasx_xvor_v(r_l, b_l);
+ r_h = __lasx_xvor_v(r_h, b_h);
+ dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
+ dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
+ __lasx_xvst(dst_l, dst_argb1555, 0);
+ __lasx_xvst(dst_h, dst_argb1555, 32);
+ dst_argb1555 += 64;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1);
+ dst0 = __lasx_xvpickev_b(src1, src0);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_y, 0);
+ src_yuy2 += 64;
+ dst_y += 32;
+ }
+}
+
+void YUY2ToUVRow_LASX(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src_yuy2_next, 0,
+ src_yuy2_next, 32, src0, src1, src2, src3);
+ src0 = __lasx_xvpickod_b(src1, src0);
+ src1 = __lasx_xvpickod_b(src3, src2);
+ tmp0 = __lasx_xvavgr_bu(src1, src0);
+ tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
+ dst0 = __lasx_xvpickev_b(tmp0, tmp0);
+ dst1 = __lasx_xvpickod_b(tmp0, tmp0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_yuy2 += 64;
+ src_yuy2_next += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1);
+ tmp0 = __lasx_xvpickod_b(src1, src0);
+ tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
+ dst0 = __lasx_xvpickev_b(tmp0, tmp0);
+ dst1 = __lasx_xvpickod_b(tmp0, tmp0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_yuy2 += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1);
+ dst0 = __lasx_xvpickod_b(src1, src0);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_y, 0);
+ src_uyvy += 64;
+ dst_y += 32;
+ }
+}
+
+void UYVYToUVRow_LASX(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2, src3, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src_uyvy_next, 0,
+ src_uyvy_next, 32, src0, src1, src2, src3);
+ src0 = __lasx_xvpickev_b(src1, src0);
+ src1 = __lasx_xvpickev_b(src3, src2);
+ tmp0 = __lasx_xvavgr_bu(src1, src0);
+ tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
+ dst0 = __lasx_xvpickev_b(tmp0, tmp0);
+ dst1 = __lasx_xvpickod_b(tmp0, tmp0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_uyvy += 64;
+ src_uyvy_next += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void UYVYToUV422Row_LASX(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
+ dst0 = __lasx_xvpickev_b(tmp0, tmp0);
+ dst1 = __lasx_xvpickod_b(tmp0, tmp0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_uyvy += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2, src3, vec0, vec1, vec2, vec3;
+ __m256i tmp0, tmp1, dst0;
+ __m256i const_19 = __lasx_xvldi(0x19);
+ __m256i const_42 = __lasx_xvldi(0x42);
+ __m256i const_81 = __lasx_xvldi(0x81);
+ __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
+ 0x1080108010801080, 0x1080108010801080};
+ __m256i control = {0x0000000400000000, 0x0000000500000001,
+ 0x0000000600000002, 0x0000000700000003};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
+ src_argb0, 96, src0, src1, src2, src3);
+ vec0 = __lasx_xvpickev_b(src1, src0);
+ vec1 = __lasx_xvpickev_b(src3, src2);
+ vec2 = __lasx_xvpickod_b(src1, src0);
+ vec3 = __lasx_xvpickod_b(src3, src2);
+ tmp0 = __lasx_xvmaddwev_h_bu(const_1080, vec0, const_19);
+ tmp1 = __lasx_xvmaddwev_h_bu(const_1080, vec1, const_19);
+ tmp0 = __lasx_xvmaddwev_h_bu(tmp0, vec2, const_81);
+ tmp1 = __lasx_xvmaddwev_h_bu(tmp1, vec3, const_81);
+ tmp0 = __lasx_xvmaddwod_h_bu(tmp0, vec0, const_42);
+ tmp1 = __lasx_xvmaddwod_h_bu(tmp1, vec1, const_42);
+ dst0 = __lasx_xvssrani_b_h(tmp1, tmp0, 8);
+ dst0 = __lasx_xvperm_w(dst0, control);
+ __lasx_xvst(dst0, dst_y, 0);
+ src_argb0 += 128;
+ dst_y += 32;
+ }
+}
+
+void ARGBToUVRow_LASX(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ const uint8_t* src_argb1 = src_argb0 + src_stride_argb;
+
+ __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m256i vec0, vec1, vec2, vec3;
+ __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
+ __m256i const_0x70 = {0x0038003800380038, 0x0038003800380038,
+ 0x0038003800380038, 0x0038003800380038};
+ __m256i const_0x4A = {0x0025002500250025, 0x0025002500250025,
+ 0x0025002500250025, 0x0025002500250025};
+ __m256i const_0x26 = {0x0013001300130013, 0x0013001300130013,
+ 0x0013001300130013, 0x0013001300130013};
+ __m256i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f,
+ 0x002f002f002f002f, 0x002f002f002f002f};
+ __m256i const_0x12 = {0x0009000900090009, 0x0009000900090009,
+ 0x0009000900090009, 0x0009000900090009};
+ __m256i control = {0x0000000400000000, 0x0000000500000001,
+ 0x0000000600000002, 0x0000000700000003};
+ __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
+ src_argb0, 96, src0, src1, src2, src3);
+ DUP4_ARG2(__lasx_xvld, src_argb1, 0, src_argb1, 32, src_argb1, 64,
+ src_argb1, 96, src4, src5, src6, src7);
+ vec0 = __lasx_xvaddwev_h_bu(src0, src4);
+ vec1 = __lasx_xvaddwev_h_bu(src1, src5);
+ vec2 = __lasx_xvaddwev_h_bu(src2, src6);
+ vec3 = __lasx_xvaddwev_h_bu(src3, src7);
+ tmp0 = __lasx_xvpickev_h(vec1, vec0);
+ tmp1 = __lasx_xvpickev_h(vec3, vec2);
+ tmp2 = __lasx_xvpickod_h(vec1, vec0);
+ tmp3 = __lasx_xvpickod_h(vec3, vec2);
+ vec0 = __lasx_xvaddwod_h_bu(src0, src4);
+ vec1 = __lasx_xvaddwod_h_bu(src1, src5);
+ vec2 = __lasx_xvaddwod_h_bu(src2, src6);
+ vec3 = __lasx_xvaddwod_h_bu(src3, src7);
+ tmp4 = __lasx_xvpickev_h(vec1, vec0);
+ tmp5 = __lasx_xvpickev_h(vec3, vec2);
+ vec0 = __lasx_xvpickev_h(tmp1, tmp0);
+ vec1 = __lasx_xvpickod_h(tmp1, tmp0);
+ src0 = __lasx_xvavgr_h(vec0, vec1);
+ vec0 = __lasx_xvpickev_h(tmp3, tmp2);
+ vec1 = __lasx_xvpickod_h(tmp3, tmp2);
+ src1 = __lasx_xvavgr_h(vec0, vec1);
+ vec0 = __lasx_xvpickev_h(tmp5, tmp4);
+ vec1 = __lasx_xvpickod_h(tmp5, tmp4);
+ src2 = __lasx_xvavgr_h(vec0, vec1);
+ dst0 = __lasx_xvmadd_h(const_0x8080, src0, const_0x70);
+ dst0 = __lasx_xvmsub_h(dst0, src2, const_0x4A);
+ dst0 = __lasx_xvmsub_h(dst0, src1, const_0x26);
+ dst1 = __lasx_xvmadd_h(const_0x8080, src1, const_0x70);
+ dst1 = __lasx_xvmsub_h(dst1, src2, const_0x5E);
+ dst1 = __lasx_xvmsub_h(dst1, src0, const_0x12);
+ dst0 = __lasx_xvperm_w(dst0, control);
+ dst1 = __lasx_xvperm_w(dst1, control);
+ dst0 = __lasx_xvssrani_b_h(dst0, dst0, 8);
+ dst1 = __lasx_xvssrani_b_h(dst1, dst1, 8);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_argb0 += 128;
+ src_argb1 += 128;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+// Converts 32 ARGB pixels per iteration to RGB24 by dropping the alpha byte.
+// The per-lane shuffle compacts each 16-byte lane's four pixels into 12
+// B,G,R bytes; xvperm_w then packs the lane results contiguously.  The main
+// loop's 32-byte stores at offsets 24/48/72 intentionally overlap: each
+// carries 8 junk tail bytes that the following store overwrites.  The last
+// group of 32 pixels is therefore handled after the loop with exact 8-byte
+// element stores so nothing is written past dst_rgb; hence len = width/32 - 1.
+void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  int len = (width / 32) - 1;
+  __m256i src0, src1, src2, src3;
+  __m256i tmp0, tmp1, tmp2, tmp3;
+  // Byte selectors per 128-bit lane: 0,1,2, 4,5,6, 8,9,10, 12,13,14
+  // (skip every 4th byte = alpha); top 4 bytes of each lane unused.
+  __m256i shuf = {0x0908060504020100, 0x000000000E0D0C0A,
+                  0x0908060504020100, 0x000000000E0D0C0A};
+  // 32-bit-word permutation that packs the two 12-byte lane halves together.
+  __m256i control = {0x0000000100000000, 0x0000000400000002,
+                     0x0000000600000005, 0x0000000700000003};
+  for (x = 0; x < len; x++) {
+    DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64,
+              src_argb, 96, src0, src1, src2, src3);
+    tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+    tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+    tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+    tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+    tmp0 = __lasx_xvperm_w(tmp0, control);
+    tmp1 = __lasx_xvperm_w(tmp1, control);
+    tmp2 = __lasx_xvperm_w(tmp2, control);
+    tmp3 = __lasx_xvperm_w(tmp3, control);
+    // Overlapping 32-byte stores; the junk top bytes of each are overwritten
+    // by the next store (or by the next loop iteration / the tail below).
+    __lasx_xvst(tmp0, dst_rgb, 0);
+    __lasx_xvst(tmp1, dst_rgb, 24);
+    __lasx_xvst(tmp2, dst_rgb, 48);
+    __lasx_xvst(tmp3, dst_rgb, 72);
+    dst_rgb += 96;
+    src_argb += 128;
+  }
+  // Tail: last group of 32 pixels.  Same transform, but only the 24 valid
+  // bytes of tmp3 are written, via three 8-byte element stores.
+  DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64,
+            src_argb, 96, src0, src1, src2, src3);
+  tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+  tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+  tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+  tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+  tmp0 = __lasx_xvperm_w(tmp0, control);
+  tmp1 = __lasx_xvperm_w(tmp1, control);
+  tmp2 = __lasx_xvperm_w(tmp2, control);
+  tmp3 = __lasx_xvperm_w(tmp3, control);
+  __lasx_xvst(tmp0, dst_rgb, 0);
+  __lasx_xvst(tmp1, dst_rgb, 24);
+  __lasx_xvst(tmp2, dst_rgb, 48);
+  dst_rgb += 72;
+  __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
+  __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
+  __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
+}
+
+// Converts 32 ARGB pixels per iteration to RAW (3 bytes/pixel).  Identical
+// structure to ARGBToRGB24Row_LASX — including the overlapping 32-byte
+// stores and the element-store tail (len = width/32 - 1) — but the shuffle
+// emits each pixel's bytes in reverse order (indices 2,1,0), swapping the
+// B and R channels.
+void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  int len = (width / 32) - 1;
+  __m256i src0, src1, src2, src3;
+  __m256i tmp0, tmp1, tmp2, tmp3;
+  // Byte selectors per 128-bit lane: 2,1,0, 6,5,4, 10,9,8, 14,13,12.
+  __m256i shuf = {0x090A040506000102, 0x000000000C0D0E08,
+                  0x090A040506000102, 0x000000000C0D0E08};
+  // Word permutation packing the two 12-byte lane halves back to back.
+  __m256i control = {0x0000000100000000, 0x0000000400000002,
+                     0x0000000600000005, 0x0000000700000003};
+  for (x = 0; x < len; x++) {
+    DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64,
+              src_argb, 96, src0, src1, src2, src3);
+    tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+    tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+    tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+    tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+    tmp0 = __lasx_xvperm_w(tmp0, control);
+    tmp1 = __lasx_xvperm_w(tmp1, control);
+    tmp2 = __lasx_xvperm_w(tmp2, control);
+    tmp3 = __lasx_xvperm_w(tmp3, control);
+    // Overlapping stores; junk tail bytes are overwritten by the next store.
+    __lasx_xvst(tmp0, dst_rgb, 0);
+    __lasx_xvst(tmp1, dst_rgb, 24);
+    __lasx_xvst(tmp2, dst_rgb, 48);
+    __lasx_xvst(tmp3, dst_rgb, 72);
+    dst_rgb += 96;
+    src_argb += 128;
+  }
+  // Tail: last 32 pixels; the final 24 bytes use exact 8-byte element stores.
+  DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64,
+            src_argb, 96, src0, src1, src2, src3);
+  tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+  tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+  tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+  tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+  tmp0 = __lasx_xvperm_w(tmp0, control);
+  tmp1 = __lasx_xvperm_w(tmp1, control);
+  tmp2 = __lasx_xvperm_w(tmp2, control);
+  tmp3 = __lasx_xvperm_w(tmp3, control);
+  __lasx_xvst(tmp0, dst_rgb, 0);
+  __lasx_xvst(tmp1, dst_rgb, 24);
+  __lasx_xvst(tmp2, dst_rgb, 48);
+  dst_rgb += 72;
+  __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
+  __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
+  __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
+}
+
+// Converts 16 ARGB pixels per iteration to RGB565
+// (B in bits 0-4, G in bits 5-10, R in bits 11-15 of each halfword).
+void ARGBToRGB565Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  int len = width / 16;
+  __m256i zero = __lasx_xvldi(0);
+  __m256i src0, src1, tmp0, tmp1, dst0;
+  // Per-byte left-shift amounts: 0 for even bytes (B), 3 for odd bytes (R).
+  __m256i shift = {0x0300030003000300, 0x0300030003000300,
+                   0x0300030003000300, 0x0300030003000300};
+
+  for (x = 0; x < len; x++) {
+    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+    tmp0 = __lasx_xvpickev_b(src1, src0);  // even bytes: B,R interleaved
+    tmp1 = __lasx_xvpickod_b(src1, src0);  // odd bytes: G,A interleaved
+    tmp0 = __lasx_xvsrli_b(tmp0, 3);       // B and R down to 5 bits
+    tmp1 = __lasx_xvpackev_b(zero, tmp1);  // G zero-extended to 16 bits
+    tmp1 = __lasx_xvsrli_h(tmp1, 2);       // G down to 6 bits
+    tmp0 = __lasx_xvsll_b(tmp0, shift);    // B at bits 0-4, R at bits 11-15
+    tmp1 = __lasx_xvslli_h(tmp1, 5);       // G at bits 5-10
+    dst0 = __lasx_xvor_v(tmp0, tmp1);
+    // Undo the 64-bit lane interleave introduced by pickev/pickod.
+    dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+    __lasx_xvst(dst0, dst_rgb, 0);
+    dst_rgb += 32;
+    src_argb += 64;
+  }
+}
+
+// Converts 16 ARGB pixels per iteration to ARGB1555
+// (B bits 0-4, G bits 5-9, R bits 10-14, A bit 15 of each halfword).
+void ARGBToARGB1555Row_LASX(const uint8_t* src_argb,
+                            uint8_t* dst_rgb,
+                            int width) {
+  int x;
+  int len = width / 16;
+  __m256i zero = __lasx_xvldi(0);
+  __m256i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0;
+  // Per-byte right-shift amounts for the odd-byte vector: G>>3, A>>7.
+  __m256i shift1 = {0x0703070307030703, 0x0703070307030703,
+                    0x0703070307030703, 0x0703070307030703};
+  // Per-byte left-shift amounts for the even-byte vector: B<<0, R<<2.
+  __m256i shift2 = {0x0200020002000200, 0x0200020002000200,
+                    0x0200020002000200, 0x0200020002000200};
+
+  for (x = 0; x < len; x++) {
+    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+    tmp0 = __lasx_xvpickev_b(src1, src0);  // even bytes: B,R interleaved
+    tmp1 = __lasx_xvpickod_b(src1, src0);  // odd bytes: G,A interleaved
+    tmp0 = __lasx_xvsrli_b(tmp0, 3);       // B and R down to 5 bits
+    tmp1 = __lasx_xvsrl_b(tmp1, shift1);   // G to 5 bits, A to 1 bit
+    tmp0 = __lasx_xvsll_b(tmp0, shift2);   // B at bits 0-4, R at bits 10-14
+    tmp2 = __lasx_xvpackev_b(zero, tmp1);  // G zero-extended to 16 bits
+    tmp3 = __lasx_xvpackod_b(zero, tmp1);  // A zero-extended to 16 bits
+    tmp2 = __lasx_xvslli_h(tmp2, 5);       // G at bits 5-9
+    tmp3 = __lasx_xvslli_h(tmp3, 15);      // A at bit 15
+    dst0 = __lasx_xvor_v(tmp0, tmp2);
+    dst0 = __lasx_xvor_v(dst0, tmp3);
+    // Undo the 64-bit lane interleave introduced by pickev/pickod.
+    dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+    __lasx_xvst(dst0, dst_rgb, 0);
+    dst_rgb += 32;
+    src_argb += 64;
+  }
+}
+
+// Converts 16 ARGB pixels per iteration to ARGB4444: the high nibble of
+// every channel survives.  Each output halfword holds G:B nibbles in the
+// low byte and A:R nibbles in the high byte.
+void ARGBToARGB4444Row_LASX(const uint8_t* src_argb,
+                            uint8_t* dst_rgb,
+                            int width) {
+  int i;
+  int loops = width / 16;  // 16 pixels = 64 source bytes per pass.
+  __m256i in_lo, in_hi, even_br, odd_ga, out;
+
+  for (i = 0; i < loops; i++) {
+    in_lo = __lasx_xvld(src_argb, 0);
+    in_hi = __lasx_xvld(src_argb, 32);
+    even_br = __lasx_xvpickev_b(in_hi, in_lo);  // even bytes: B,R interleaved
+    odd_ga = __lasx_xvpickod_b(in_hi, in_lo);   // odd bytes: G,A interleaved
+    even_br = __lasx_xvsrli_b(even_br, 4);      // B,R high nibble -> low half
+    odd_ga = __lasx_xvandi_b(odd_ga, 0xF0);     // G,A high nibble kept in place
+    out = __lasx_xvor_v(odd_ga, even_br);
+    // pickev/pickod interleave 64-bit lanes; 0xD8 restores pixel order.
+    out = __lasx_xvpermi_d(out, 0xD8);
+    __lasx_xvst(out, dst_rgb, 0);
+    dst_rgb += 32;
+    src_argb += 64;
+  }
+}
+
+// Computes full-resolution (4:4:4) U and V planes from 32 ARGB pixels per
+// iteration, biased to 128:
+//   U = (0x8080 + 112*B - 74*G - 38*R) >> 8
+//   V = (0x8080 + 112*R - 94*G - 18*B) >> 8
+void ARGBToUV444Row_LASX(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int32_t width) {
+  int x;
+  int len = width / 32;
+  __m256i src0, src1, src2, src3;
+  __m256i tmp0, tmp1, tmp2, tmp3;
+  __m256i reg0, reg1, reg2, reg3, dst0, dst1;
+  __m256i const_112 = __lasx_xvldi(112);
+  __m256i const_74 = __lasx_xvldi(74);
+  __m256i const_38 = __lasx_xvldi(38);
+  __m256i const_94 = __lasx_xvldi(94);
+  __m256i const_18 = __lasx_xvldi(18);
+  // Per-halfword bias of 0x8080: rounding + the 128 chroma offset.
+  __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080,
+                          0x8080808080808080, 0x8080808080808080};
+  // Word permutation restoring pixel order after the lane-wise narrowing.
+  __m256i control = {0x0000000400000000, 0x0000000500000001,
+                     0x0000000600000002, 0x0000000700000003};
+  for (x = 0; x < len; x++) {
+    DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64,
+              src_argb, 96, src0, src1, src2, src3);
+    tmp0 = __lasx_xvpickev_h(src1, src0);  // (B,G) halfwords of each pixel
+    tmp1 = __lasx_xvpickod_h(src1, src0);  // (R,A) halfwords of each pixel
+    tmp2 = __lasx_xvpickev_h(src3, src2);
+    tmp3 = __lasx_xvpickod_h(src3, src2);
+    // U: 0x8080 + 112*B (even bytes of tmp0/tmp2) ...
+    reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp0, const_112);
+    reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp2, const_112);
+    // ... minus 74*G (odd bytes) and 38*R (even bytes of tmp1/tmp3).
+    reg2 = __lasx_xvmulwod_h_bu(tmp0, const_74);
+    reg3 = __lasx_xvmulwod_h_bu(tmp2, const_74);
+    reg2 = __lasx_xvmaddwev_h_bu(reg2, tmp1, const_38);
+    reg3 = __lasx_xvmaddwev_h_bu(reg3, tmp3, const_38);
+    reg0 = __lasx_xvsub_h(reg0, reg2);
+    reg1 = __lasx_xvsub_h(reg1, reg3);
+    // Narrow with saturation, taking the high byte (>> 8).
+    dst0 = __lasx_xvssrani_b_h(reg1, reg0, 8);
+    dst0 = __lasx_xvperm_w(dst0, control);
+    // V: 0x8080 + 112*R minus 18*B and 94*G.
+    reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp1, const_112);
+    reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp3, const_112);
+    reg2 = __lasx_xvmulwev_h_bu(tmp0, const_18);
+    reg3 = __lasx_xvmulwev_h_bu(tmp2, const_18);
+    reg2 = __lasx_xvmaddwod_h_bu(reg2, tmp0, const_94);
+    reg3 = __lasx_xvmaddwod_h_bu(reg3, tmp2, const_94);
+    reg0 = __lasx_xvsub_h(reg0, reg2);
+    reg1 = __lasx_xvsub_h(reg1, reg3);
+    dst1 = __lasx_xvssrani_b_h(reg1, reg0, 8);
+    dst1 = __lasx_xvperm_w(dst1, control);
+    __lasx_xvst(dst0, dst_u, 0);
+    __lasx_xvst(dst1, dst_v, 0);
+    dst_u += 32;
+    dst_v += 32;
+    src_argb += 128;
+  }
+}
+
+// Multiplies two ARGB rows channel-by-channel, 8 pixels per iteration.
+// Each byte of src0 is duplicated (x -> x*257 as a 16-bit value) and the
+// unsigned high-half multiply by the zero-extended src1 byte yields
+// (x*257*y) >> 16, which approximates (x*y) / 255.
+void ARGBMultiplyRow_LASX(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  int i;
+  int loops = width / 8;  // 8 pixels = 32 bytes per pass.
+  __m256i zero = __lasx_xvldi(0);
+  __m256i va, vb, lo_a, hi_a, lo_b, hi_b, prod_lo, prod_hi;
+
+  for (i = 0; i < loops; i++) {
+    va = __lasx_xvld(src_argb0, 0);
+    vb = __lasx_xvld(src_argb1, 0);
+    lo_a = __lasx_xvilvl_b(va, va);    // duplicate bytes: x -> x*257
+    hi_a = __lasx_xvilvh_b(va, va);
+    lo_b = __lasx_xvilvl_b(zero, vb);  // zero-extend bytes to 16 bits
+    hi_b = __lasx_xvilvh_b(zero, vb);
+    prod_lo = __lasx_xvmuh_hu(lo_a, lo_b);  // (x*257*y) >> 16
+    prod_hi = __lasx_xvmuh_hu(hi_a, hi_b);
+    prod_lo = __lasx_xvpickev_b(prod_hi, prod_lo);
+    __lasx_xvst(prod_lo, dst_argb, 0);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+// Adds two ARGB rows byte-by-byte with unsigned saturation (clamps at 255).
+void ARGBAddRow_LASX(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  int i;
+  int loops = width / 8;  // 8 ARGB pixels = 32 bytes per pass.
+  __m256i vec_a, vec_b, vec_sum;
+
+  for (i = 0; i < loops; i++) {
+    vec_a = __lasx_xvld(src_argb0, 0);
+    vec_b = __lasx_xvld(src_argb1, 0);
+    vec_sum = __lasx_xvsadd_bu(vec_a, vec_b);
+    __lasx_xvst(vec_sum, dst_argb, 0);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+// Subtracts src_argb1 from src_argb0 byte-by-byte with unsigned saturation
+// (clamps at 0).
+void ARGBSubtractRow_LASX(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  int i;
+  int loops = width / 8;  // 8 ARGB pixels = 32 bytes per pass.
+  __m256i minuend, subtrahend, diff;
+
+  for (i = 0; i < loops; i++) {
+    minuend = __lasx_xvld(src_argb0, 0);
+    subtrahend = __lasx_xvld(src_argb1, 0);
+    diff = __lasx_xvssub_bu(minuend, subtrahend);
+    __lasx_xvst(diff, dst_argb, 0);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+// Premultiplies B, G and R of each pixel by its alpha (alpha unchanged),
+// 16 pixels per iteration.  Channels and alpha are byte-duplicated
+// (x -> x*257) and multiplied as 32-bit products, so the >>24 narrowing
+// computes (c*257 * a*257) >> 24, approximating (c * a) / 255.
+void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width) {
+  int x;
+  int len = width / 16;
+  __m256i src0, src1, tmp0, tmp1;
+  __m256i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m256i b, g, r, a, dst0, dst1;
+  // Halfword shuffle restoring order after the even/odd widening multiplies.
+  __m256i control = {0x0005000100040000, 0x0007000300060002,
+                     0x0005000100040000, 0x0007000300060002};
+
+  for (x = 0; x < len; x++) {
+    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+    tmp0 = __lasx_xvpickev_b(src1, src0);  // even bytes: B,R interleaved
+    tmp1 = __lasx_xvpickod_b(src1, src0);  // odd bytes: G,A interleaved
+    b = __lasx_xvpackev_b(tmp0, tmp0);     // B duplicated: B*257 halfwords
+    r = __lasx_xvpackod_b(tmp0, tmp0);     // R*257
+    g = __lasx_xvpackev_b(tmp1, tmp1);     // G*257
+    a = __lasx_xvpackod_b(tmp1, tmp1);     // A*257
+    reg0 = __lasx_xvmulwev_w_hu(b, a);
+    reg1 = __lasx_xvmulwod_w_hu(b, a);
+    reg2 = __lasx_xvmulwev_w_hu(r, a);
+    reg3 = __lasx_xvmulwod_w_hu(r, a);
+    reg4 = __lasx_xvmulwev_w_hu(g, a);
+    reg5 = __lasx_xvmulwod_w_hu(g, a);
+    // Saturating narrow with >>24 yields the attenuated 8-bit channel.
+    reg0 = __lasx_xvssrani_h_w(reg1, reg0, 24);
+    reg2 = __lasx_xvssrani_h_w(reg3, reg2, 24);
+    reg4 = __lasx_xvssrani_h_w(reg5, reg4, 24);
+    reg0 = __lasx_xvshuf_h(control, reg0, reg0);
+    reg2 = __lasx_xvshuf_h(control, reg2, reg2);
+    reg4 = __lasx_xvshuf_h(control, reg4, reg4);
+    tmp0 = __lasx_xvpackev_b(reg4, reg0);  // (B,G) byte pairs
+    tmp1 = __lasx_xvpackev_b(a, reg2);     // (R, original A) byte pairs
+    dst0 = __lasx_xvilvl_h(tmp1, tmp0);    // interleave to B,G,R,A pixels
+    dst1 = __lasx_xvilvh_h(tmp1, tmp0);
+    __lasx_xvst(dst0, dst_argb, 0);
+    __lasx_xvst(dst1, dst_argb, 32);
+    dst_argb += 64;
+    src_argb += 64;
+  }
+}
+
+// ARGB -> RGB565 with ordered dithering: dither4 holds four per-column
+// dither bytes which are added to B, G and R (clamped to 255) before the
+// channels are truncated to 5/6/5 bits.  16 pixels per iteration.
+void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                const uint32_t dither4,
+                                int width) {
+  int x;
+  int len = width / 16;
+  __m256i src0, src1, tmp0, tmp1, dst0;
+  __m256i b, g, r;
+  __m256i zero = __lasx_xvldi(0);
+  // Replicate the 4 dither bytes across the vector ...
+  __m256i vec_dither = __lasx_xvldrepl_w(&dither4, 0);
+
+  // ... and zero-extend them to 16 bits to match the widened channels.
+  vec_dither = __lasx_xvilvl_b(zero, vec_dither);
+  for (x = 0; x < len; x++) {
+    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+    tmp0 = __lasx_xvpickev_b(src1, src0);  // even bytes: B,R interleaved
+    tmp1 = __lasx_xvpickod_b(src1, src0);  // odd bytes: G,A interleaved
+    b = __lasx_xvpackev_b(zero, tmp0);     // B zero-extended to 16 bits
+    r = __lasx_xvpackod_b(zero, tmp0);     // R zero-extended
+    g = __lasx_xvpackev_b(zero, tmp1);     // G zero-extended
+    b = __lasx_xvadd_h(b, vec_dither);
+    g = __lasx_xvadd_h(g, vec_dither);
+    r = __lasx_xvadd_h(r, vec_dither);
+    // Clamp to [0,255] so the dither add cannot wrap into the next field.
+    DUP2_ARG1(__lasx_xvclip255_h, b, g, b, g);
+    r = __lasx_xvclip255_h(r);
+    b = __lasx_xvsrai_h(b, 3);   // B to 5 bits
+    g = __lasx_xvsrai_h(g, 2);   // G to 6 bits
+    r = __lasx_xvsrai_h(r, 3);   // R to 5 bits
+    g = __lasx_xvslli_h(g, 5);   // G at bits 5-10
+    r = __lasx_xvslli_h(r, 11);  // R at bits 11-15
+    dst0 = __lasx_xvor_v(b, g);
+    dst0 = __lasx_xvor_v(dst0, r);
+    // Undo the 64-bit lane interleave introduced by pickev/pickod.
+    dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+    __lasx_xvst(dst0, dst_rgb, 0);
+    src_argb += 64;
+    dst_rgb += 32;
+  }
+}
+
+// Reorders the four bytes of every ARGB pixel according to shuffler[0..3]
+// (each entry a source channel index 0-3), 16 pixels per iteration.
+void ARGBShuffleRow_LASX(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  int x;
+  int len = width / 16;
+  __m256i src0, src1, dst0, dst1;
+  // Per-pixel base byte offsets 0,4,8,12 within each 128-bit lane.
+  __m256i shuf = {0x0404040400000000, 0x0C0C0C0C08080808,
+                  0x0404040400000000, 0x0C0C0C0C08080808};
+  // Replicate the 4 shuffler index bytes to every 32-bit word.
+  __m256i temp = __lasx_xvldrepl_w(shuffler, 0);
+
+  // base + index gives a complete per-lane byte selector for xvshuf_b.
+  shuf = __lasx_xvadd_b(shuf, temp);
+  for (x = 0; x < len; x++) {
+    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+    dst0 = __lasx_xvshuf_b(src0, src0, shuf);
+    dst1 = __lasx_xvshuf_b(src1, src1, shuf);
+    __lasx_xvst(dst0, dst_argb, 0);
+    __lasx_xvst(dst1, dst_argb, 32);
+    src_argb += 64;
+    dst_argb += 64;
+  }
+}
+
+// Scales every channel by the matching channel of 'value', 8 pixels per
+// iteration.  Both source and value bytes are duplicated (x -> x*257), the
+// unsigned high-half multiply gives bits 16-31 of the product, and taking
+// the odd bytes discards 8 more bits — an overall >>24, i.e. roughly
+// (src * value_channel) / 255.
+void ARGBShadeRow_LASX(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value) {
+  int x;
+  int len = width / 8;
+  __m256i src0, dst0, tmp0, tmp1;
+  __m256i vec_value = __lasx_xvreplgr2vr_w(value);
+
+  // Duplicate each scale byte: s -> s*257 as a 16-bit value.
+  vec_value = __lasx_xvilvl_b(vec_value, vec_value);
+  for (x = 0; x < len; x++) {
+    src0 = __lasx_xvld(src_argb, 0);
+    tmp0 = __lasx_xvilvl_b(src0, src0);  // duplicate bytes: x -> x*257
+    tmp1 = __lasx_xvilvh_b(src0, src0);
+    tmp0 = __lasx_xvmuh_hu(tmp0, vec_value);  // (x*257 * s*257) >> 16
+    tmp1 = __lasx_xvmuh_hu(tmp1, vec_value);
+    dst0 = __lasx_xvpickod_b(tmp1, tmp0);     // high bytes: a further >> 8
+    __lasx_xvst(dst0, dst_argb, 0);
+    src_argb += 32;
+    dst_argb += 32;
+  }
+}
+
+// Replaces B, G and R of every pixel with the luma value
+//   Y = (29*B + 150*G + 77*R + 128) >> 8
+// (alpha preserved), 16 pixels per iteration.
+void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  int x;
+  int len = width / 16;
+  __m256i src0, src1, tmp0, tmp1;
+  __m256i reg0, reg1, reg2, dst0, dst1;
+  // NOTE(review): xvldi(0x480) is expected to replicate 128 into each
+  // halfword (rounding term), matching the name — confirm against xvldi
+  // immediate encoding.
+  __m256i const_128 = __lasx_xvldi(0x480);
+  __m256i const_150 = __lasx_xvldi(0x96);
+  // Per-pixel (B,R) weight pairs: 0x1D = 29 for B, 0x4D = 77 for R.
+  __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D,
+                      0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
+
+  for (x = 0; x < len; x++) {
+    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+    tmp0 = __lasx_xvpickev_b(src1, src0);  // even bytes: B,R interleaved
+    tmp1 = __lasx_xvpickod_b(src1, src0);  // odd bytes: G,A interleaved
+    reg0 = __lasx_xvdp2_h_bu(tmp0, const_br);  // 29*B + 77*R
+    reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150);  // 128 + 150*G
+    reg2 = __lasx_xvadd_h(reg0, reg1);
+    tmp0 = __lasx_xvpackod_b(reg2, reg2);  // high bytes (>>8): Y into B and G
+    tmp1 = __lasx_xvpackod_b(tmp1, reg2);  // Y into R, paired with original A
+    dst0 = __lasx_xvilvl_h(tmp1, tmp0);    // reassemble Y,Y,Y,A pixels
+    dst1 = __lasx_xvilvh_h(tmp1, tmp0);
+    __lasx_xvst(dst0, dst_argb, 0);
+    __lasx_xvst(dst1, dst_argb, 32);
+    src_argb += 64;
+    dst_argb += 64;
+  }
+}
+
+// Applies the sepia matrix in place (alpha preserved), 16 pixels/iteration:
+//   b = (17*B + 68*G + 35*R) >> 7
+//   g = (22*B + 88*G + 45*R) >> 7
+//   r = (24*B + 98*G + 50*R) >> 7
+void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width) {
+  int x;
+  int len = width / 16;
+  __m256i src0, src1, tmp0, tmp1;
+  __m256i reg0, reg1, spb, spg, spr;
+  __m256i dst0, dst1;
+  __m256i spb_g = __lasx_xvldi(68);  // G weight for the blue output
+  __m256i spg_g = __lasx_xvldi(88);  // G weight for the green output
+  __m256i spr_g = __lasx_xvldi(98);  // G weight for the red output
+  // (B,R) weight pairs: 0x11=17/0x23=35, 0x16=22/0x2D=45, 0x18=24/0x32=50.
+  __m256i spb_br = {0x2311231123112311, 0x2311231123112311,
+                    0x2311231123112311, 0x2311231123112311};
+  __m256i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16,
+                    0x2D162D162D162D16, 0x2D162D162D162D16};
+  __m256i spr_br = {0x3218321832183218, 0x3218321832183218,
+                    0x3218321832183218, 0x3218321832183218};
+  // Selector merging the red result (low bytes of spr) with the original
+  // alpha (odd bytes of tmp1) into (r, A) byte pairs.
+  __m256i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908,
+                   0x1706150413021100, 0x1F0E1D0C1B0A1908};
+
+  for (x = 0; x < len; x++) {
+    DUP2_ARG2(__lasx_xvld, dst_argb, 0, dst_argb, 32, src0, src1);
+    tmp0 = __lasx_xvpickev_b(src1, src0);  // even bytes: B,R interleaved
+    tmp1 = __lasx_xvpickod_b(src1, src0);  // odd bytes: G,A interleaved
+    DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg);
+    spr = __lasx_xvdp2_h_bu(tmp0, spr_br);
+    spb = __lasx_xvmaddwev_h_bu(spb, tmp1, spb_g);  // + 68*G
+    spg = __lasx_xvmaddwev_h_bu(spg, tmp1, spg_g);  // + 88*G
+    spr = __lasx_xvmaddwev_h_bu(spr, tmp1, spr_g);  // + 98*G
+    spb = __lasx_xvsrli_h(spb, 7);
+    spg = __lasx_xvsrli_h(spg, 7);
+    spr = __lasx_xvsrli_h(spr, 7);
+    // Clamp g and r to 255.  Blue needs no clamp: its weights sum to 120,
+    // so (120*255) >> 7 = 239 can never exceed 255.
+    spg = __lasx_xvsat_hu(spg, 7);
+    spr = __lasx_xvsat_hu(spr, 7);
+    reg0 = __lasx_xvpackev_b(spg, spb);       // (b, g) byte pairs
+    reg1 = __lasx_xvshuf_b(tmp1, spr, shuff); // (r, A) byte pairs
+    dst0 = __lasx_xvilvl_h(reg1, reg0);       // reassemble b,g,r,A pixels
+    dst1 = __lasx_xvilvh_h(reg1, reg0);
+    __lasx_xvst(dst0, dst_argb, 0);
+    __lasx_xvst(dst1, dst_argb, 32);
+    dst_argb += 64;
+  }
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv