diff options
author | Lu Wang <wanglu@loongson.cn> | 2023-05-19 16:46:54 +0800 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-05-19 18:55:58 +0000 |
commit | 8670bcf17faca69603d2d49fa6ebd0e45123471f (patch) | |
tree | d598dcc28bab36d2bc2db58fc8b0ece69458318b /source | |
parent | a37799344d29dc8e4d3fb00ced5f07e5ce8bf1b9 (diff) | |
download | libyuv-8670bcf17faca69603d2d49fa6ebd0e45123471f.tar.gz |
Optimize the following 19 functions with LSX in row_lsx.cc.
UYVYToYRow_LSX, UYVYToUVRow_LSX, UYVYToUV422Row_LSX,
ARGBToUVRow_LSX, ARGBToRGB24Row_LSX, ARGBToRAWRow_LSX,
ARGBToRGB565Row_LSX, ARGBToARGB1555Row_LSX, ARGBToARGB4444Row_LSX,
ARGBToUV444Row_LSX, ARGBMultiplyRow_LSX, ARGBAddRow_LSX,
ARGBSubtractRow_LSX, ARGBAttenuateRow_LSX, ARGBToRGB565DitherRow_LSX,
ARGBShuffleRow_LSX, ARGBShadeRow_LSX, ARGBGrayRow_LSX,
ARGBSepiaRow_LSX
Bug: libyuv:913
Change-Id: I02c0c9d68b229c4a66c96837e9b928c2f5dda1f3
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4546814
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source')
-rw-r--r-- | source/convert.cc | 40 | ||||
-rw-r--r-- | source/convert_argb.cc | 8 | ||||
-rw-r--r-- | source/convert_from_argb.cc | 97 | ||||
-rw-r--r-- | source/planar_functions.cc | 78 | ||||
-rw-r--r-- | source/row_any.cc | 41 | ||||
-rw-r--r-- | source/row_lsx.cc | 570 |
6 files changed, 834 insertions, 0 deletions
diff --git a/source/convert.cc b/source/convert.cc index 30113389..075428d0 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1558,6 +1558,26 @@ int UYVYToI420(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUVRow = UYVYToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUVRow = UYVYToUVRow_LSX; + } + } +#endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUVRow = UYVYToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUVRow = UYVYToUVRow_LSX; + } + } +#endif #if defined(HAS_UYVYTOYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { UYVYToYRow = UYVYToYRow_Any_LASX; @@ -1818,6 +1838,16 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -3610,6 +3640,16 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 4dfc5ecf..99bbb96d 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -5742,6 +5742,14 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX; + } + } +#endif #if defined(HAS_ARGBTORGB565DITHERROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index eae3ea0d..6e05876a 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -76,6 +76,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToUV444Row = ARGBToUV444Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_LSX; + } + } +#endif #if defined(HAS_ARGBTOUV444ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToUV444Row = ARGBToUV444Row_Any_LASX; @@ -251,6 +259,16 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -556,6 +574,16 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -1105,6 +1133,16 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -1289,6 +1327,16 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -1540,6 +1588,14 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_LSX; + } + } +#endif #if defined(HAS_ARGBTORGB24ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_LASX; @@ -1619,6 +1675,14 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRAWRow = ARGBToRAWRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_LSX; + } + } +#endif #if defined(HAS_ARGBTORAWROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRAWRow = ARGBToRAWRow_Any_LASX; @@ -1702,6 +1766,14 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX; + } + } +#endif #if defined(HAS_ARGBTORGB565DITHERROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; @@ -1779,6 +1851,15 @@ int ARGBToRGB565(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_LSX; + } + } +#endif + #if defined(HAS_ARGBTORGB565ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB565Row = ARGBToRGB565Row_Any_LASX; @@ -1853,6 +1934,14 @@ int ARGBToARGB1555(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_LSX; + } + } +#endif #if defined(HAS_ARGBTOARGB1555ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LASX; @@ -1927,6 +2016,14 @@ int ARGBToARGB4444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB4444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_LSX; + } + } +#endif #if defined(HAS_ARGBTOARGB4444ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LASX; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index fd12718d..e741dc50 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -2219,6 +2219,16 @@ int UYVYToI422(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LSX) && defined(HAS_UYVYTOUV422ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUV422Row = UYVYToUV422Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUV422Row = UYVYToUV422Row_LSX; + } + } +#endif #if defined(HAS_UYVYTOYROW_LASX) && defined(HAS_UYVYTOUV422ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { UYVYToYRow = UYVYToYRow_Any_LASX; @@ -2366,6 +2376,14 @@ int UYVYToY(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToYRow(src_uyvy, dst_y, width); @@ -3068,6 +3086,14 @@ int ARGBMultiply(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBMULTIPLYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_LSX; + } + } +#endif #if defined(HAS_ARGBMULTIPLYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_LASX; @@ -3153,6 +3179,14 @@ int ARGBAdd(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBADDROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBAddRow = ARGBAddRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBAddRow = ARGBAddRow_LSX; + } + } +#endif #if defined(HAS_ARGBADDROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBAddRow = ARGBAddRow_Any_LASX; @@ -3233,6 +3267,14 @@ int ARGBSubtract(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBSUBTRACTROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBSubtractRow = ARGBSubtractRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_LSX; + } + } +#endif #if defined(HAS_ARGBSUBTRACTROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBSubtractRow = ARGBSubtractRow_Any_LASX; @@ -3558,6 +3600,14 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_LSX; + } + } +#endif #if defined(HAS_ARGBATTENUATEROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX; @@ -3671,6 +3721,11 @@ int ARGBGrayTo(const uint8_t* src_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_LSX; + } +#endif #if defined(HAS_ARGBGRAYROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { ARGBGrayRow = ARGBGrayRow_LASX; @@ -3721,6 +3776,11 @@ int ARGBGray(uint8_t* dst_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_LSX; + } +#endif #if defined(HAS_ARGBGRAYROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { ARGBGrayRow = ARGBGrayRow_LASX; @@ -3769,6 +3829,11 @@ int ARGBSepia(uint8_t* dst_argb, ARGBSepiaRow = ARGBSepiaRow_MSA; } #endif +#if defined(HAS_ARGBSEPIAROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_LSX; + } +#endif #if defined(HAS_ARGBSEPIAROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { ARGBSepiaRow = ARGBSepiaRow_LASX; @@ -4194,6 +4259,11 @@ int ARGBShade(const uint8_t* src_argb, ARGBShadeRow = ARGBShadeRow_MSA; } #endif +#if defined(HAS_ARGBSHADEROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_LSX; + } +#endif #if defined(HAS_ARGBSHADEROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 8)) { ARGBShadeRow = ARGBShadeRow_LASX; @@ -4483,6 +4553,14 @@ int ARGBShuffle(const uint8_t* src_bgra, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBShuffleRow = ARGBShuffleRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_LSX; + } + } +#endif #if defined(HAS_ARGBSHUFFLEROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBShuffleRow = ARGBShuffleRow_Any_LASX; diff --git a/source/row_any.cc b/source/row_any.cc index 27b12a7a..e574543c 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -639,18 +639,27 @@ ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) #ifdef HAS_ARGBMULTIPLYROW_MSA ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) #endif +#ifdef HAS_ARGBMULTIPLYROW_LSX +ANY21(ARGBMultiplyRow_Any_LSX, ARGBMultiplyRow_LSX, 0, 4, 4, 4, 3) +#endif #ifdef HAS_ARGBMULTIPLYROW_LASX ANY21(ARGBMultiplyRow_Any_LASX, ARGBMultiplyRow_LASX, 0, 4, 4, 4, 7) #endif #ifdef HAS_ARGBADDROW_MSA ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBADDROW_LSX +ANY21(ARGBAddRow_Any_LSX, ARGBAddRow_LSX, 0, 4, 4, 4, 3) +#endif #ifdef HAS_ARGBADDROW_LASX ANY21(ARGBAddRow_Any_LASX, ARGBAddRow_LASX, 0, 4, 4, 4, 7) #endif #ifdef HAS_ARGBSUBTRACTROW_MSA ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBSUBTRACTROW_LSX +ANY21(ARGBSubtractRow_Any_LSX, ARGBSubtractRow_LSX, 0, 4, 4, 4, 3) +#endif #ifdef HAS_ARGBSUBTRACTROW_LASX ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7) #endif @@ -992,6 +1001,13 @@ ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) #endif +#if defined(HAS_ARGBTORGB24ROW_LSX) +ANY11(ARGBToRGB24Row_Any_LSX, ARGBToRGB24Row_LSX, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_LSX, ARGBToRAWRow_LSX, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_LSX, ARGBToRGB565Row_LSX, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_LSX, ARGBToARGB1555Row_LSX, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_LSX, ARGBToARGB4444Row_LSX, 0, 4, 2, 7) +#endif #if defined(HAS_ARGBTORGB24ROW_LASX) ANY11(ARGBToRGB24Row_Any_LASX, ARGBToRGB24Row_LASX, 0, 4, 3, 31) ANY11(ARGBToRAWRow_Any_LASX, ARGBToRAWRow_LASX, 0, 4, 3, 31) @@ -1230,6 +1246,9 @@ ANY11(YUY2ToYRow_Any_LASX, YUY2ToYRow_LASX, 1, 4, 1, 31) #ifdef HAS_UYVYTOYROW_MSA ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_UYVYTOYROW_LSX +ANY11(UYVYToYRow_Any_LSX, UYVYToYRow_LSX, 1, 4, 1, 15) +#endif #ifdef HAS_UYVYTOYROW_LASX ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31) #endif @@ -1326,6 +1345,9 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_MSA ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) #endif +#ifdef HAS_ARGBATTENUATEROW_LSX +ANY11(ARGBAttenuateRow_Any_LSX, ARGBAttenuateRow_LSX, 0, 4, 4, 7) +#endif #ifdef HAS_ARGBATTENUATEROW_LASX ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15) #endif @@ -1467,6 +1489,14 @@ ANY11P(ARGBToRGB565DitherRow_Any_MSA, 2, 7) #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) +ANY11P(ARGBToRGB565DitherRow_Any_LSX, + ARGBToRGB565DitherRow_LSX, + const uint32_t, + 4, + 2, + 7) +#endif #if defined(HAS_ARGBTORGB565DITHERROW_LASX) ANY11P(ARGBToRGB565DitherRow_Any_LASX, ARGBToRGB565DitherRow_LASX, @@ -1487,6 +1517,9 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) #ifdef HAS_ARGBSHUFFLEROW_MSA ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) #endif +#ifdef HAS_ARGBSHUFFLEROW_LSX +ANY11P(ARGBShuffleRow_Any_LSX, ARGBShuffleRow_LSX, const uint8_t*, 4, 4, 7) +#endif #ifdef HAS_ARGBSHUFFLEROW_LASX ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15) #endif @@ -2000,7 +2033,9 @@ ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) #endif #ifdef HAS_YUY2TOUV422ROW_LSX +ANY12(ARGBToUV444Row_Any_LSX, ARGBToUV444Row_LSX, 0, 4, 0, 15) ANY12(YUY2ToUV422Row_Any_LSX, YUY2ToUV422Row_LSX, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_LSX, UYVYToUV422Row_LSX, 1, 4, 1, 15) #endif #ifdef HAS_YUY2TOUV422ROW_LASX ANY12(ARGBToUV444Row_Any_LASX, ARGBToUV444Row_LASX, 0, 4, 0, 31) @@ -2172,6 +2207,9 @@ ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #ifdef HAS_ARGBTOUVROW_MSA ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) #endif +#ifdef HAS_ARGBTOUVROW_LSX +ANY12S(ARGBToUVRow_Any_LSX, ARGBToUVRow_LSX, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVROW_LASX ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31) #endif @@ -2292,6 +2330,9 @@ ANY12S(YUY2ToUVRow_Any_LASX, YUY2ToUVRow_LASX, 1, 4, 31) #ifdef HAS_UYVYTOUVROW_MSA ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_UYVYTOUVROW_LSX +ANY12S(UYVYToUVRow_Any_LSX, UYVYToUVRow_LSX, 1, 4, 15) +#endif #ifdef HAS_UYVYTOUVROW_LASX ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31) #endif diff --git a/source/row_lsx.cc b/source/row_lsx.cc index 15595efe..573fc94d 100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -713,6 +713,576 @@ void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2, } } +void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1); + dst0 = __lsx_vpickod_b(src1, src0); + __lsx_vst(dst0, dst_y, 0); + src_uyvy += 32; + dst_y += 16; + } +} + +void UYVYToUVRow_LSX(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src_uyvy_next, 0, + src_uyvy_next, 16, src0, src1, src2, src3); + src0 = __lsx_vpickev_b(src1, src0); + src1 = __lsx_vpickev_b(src3, src2); + tmp0 = __lsx_vavgr_bu(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_uyvy += 32; + src_uyvy_next += 32; + dst_u += 8; + dst_v += 8; + } +} + +void UYVYToUV422Row_LSX(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_uyvy += 32; + dst_u += 8; + dst_v += 8; + } +} + +void ARGBToUVRow_LSX(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + const uint8_t* src_argb1 = src_argb0 + src_stride_argb; + + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i vec0, vec1, vec2, vec3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1; + __m128i const_0x70 = {0x0038003800380038, 0x0038003800380038}; + __m128i const_0x4A = {0x0025002500250025, 0x0025002500250025}; + __m128i const_0x26 = {0x0013001300130013, 0x0013001300130013}; + __m128i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f}; + __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009}; + __m128i const_0x8080 = {0x8080808080808080, 0x8080808080808080}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, + src_argb0, 48, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32, + src_argb1, 48, src4, src5, src6, src7); + vec0 = __lsx_vaddwev_h_bu(src0, src4); + vec1 = __lsx_vaddwev_h_bu(src1, src5); + vec2 = __lsx_vaddwev_h_bu(src2, src6); + vec3 = __lsx_vaddwev_h_bu(src3, src7); + tmp0 = __lsx_vpickev_h(vec1, vec0); + tmp1 = __lsx_vpickev_h(vec3, vec2); + tmp2 = __lsx_vpickod_h(vec1, vec0); + tmp3 = __lsx_vpickod_h(vec3, vec2); + vec0 = __lsx_vaddwod_h_bu(src0, src4); + vec1 = __lsx_vaddwod_h_bu(src1, src5); + vec2 = __lsx_vaddwod_h_bu(src2, src6); + vec3 = __lsx_vaddwod_h_bu(src3, src7); + tmp4 = __lsx_vpickev_h(vec1, vec0); + tmp5 = __lsx_vpickev_h(vec3, vec2); + vec0 = __lsx_vpickev_h(tmp1, tmp0); + vec1 = __lsx_vpickod_h(tmp1, tmp0); + src0 = __lsx_vavgr_h(vec0, vec1); + vec0 = __lsx_vpickev_h(tmp3, tmp2); + vec1 = __lsx_vpickod_h(tmp3, tmp2); + src1 = __lsx_vavgr_h(vec0, vec1); + vec0 = __lsx_vpickev_h(tmp5, tmp4); + vec1 = __lsx_vpickod_h(tmp5, tmp4); + src2 = __lsx_vavgr_h(vec0, vec1); + dst0 = __lsx_vmadd_h(const_0x8080, src0, const_0x70); + dst0 = __lsx_vmsub_h(dst0, src2, const_0x4A); + dst0 = __lsx_vmsub_h(dst0, src1, const_0x26); + dst1 = __lsx_vmadd_h(const_0x8080, src1, const_0x70); + dst1 = __lsx_vmsub_h(dst1, src2, const_0x5E); + dst1 = __lsx_vmsub_h(dst1, src0, const_0x12); + dst0 = __lsx_vsrai_h(dst0, 8); + dst1 = __lsx_vsrai_h(dst1, 8); + dst0 = __lsx_vpickev_b(dst1, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + src_argb0 += 64; + src_argb1 += 64; + dst_u += 8; + dst_v += 8; + } +} + +void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 16) - 1; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i shuf = {0x0908060504020100, 0x000000000E0D0C0A}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, + 48, src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + __lsx_vst(tmp3, dst_rgb, 36); + dst_rgb += 48; + src_argb += 64; + } + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + dst_rgb += 36; + __lsx_vst(tmp3, dst_rgb, 0); +} + +void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 16) - 1; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i shuf = {0x090A040506000102, 0x000000000C0D0E08}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, + 48, src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + __lsx_vst(tmp3, dst_rgb, 36); + dst_rgb += 48; + src_argb += 64; + } + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + dst_rgb += 36; + __lsx_vst(tmp3, dst_rgb, 0); +} + +void ARGBToRGB565Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 8; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, tmp0, tmp1, dst0; + __m128i shift = {0x0300030003000300, 0x0300030003000300}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp0 = __lsx_vsrli_b(tmp0, 3); + tmp1 = __lsx_vpackev_b(zero, tmp1); + tmp1 = __lsx_vsrli_h(tmp1, 2); + tmp0 = __lsx_vsll_b(tmp0, shift); + tmp1 = __lsx_vslli_h(tmp1, 5); + dst0 = __lsx_vor_v(tmp0, tmp1); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToARGB1555Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 8; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0; + __m128i shift1 = {0x0703070307030703, 0x0703070307030703}; + __m128i shift2 = {0x0200020002000200, 0x0200020002000200}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp0 = __lsx_vsrli_b(tmp0, 3); + tmp1 = __lsx_vsrl_b(tmp1, shift1); + tmp0 = __lsx_vsll_b(tmp0, shift2); + tmp2 = __lsx_vpackev_b(zero, tmp1); + tmp3 = __lsx_vpackod_b(zero, tmp1); + tmp2 = __lsx_vslli_h(tmp2, 5); + tmp3 = __lsx_vslli_h(tmp3, 15); + dst0 = __lsx_vor_v(tmp0, tmp2); + dst0 = __lsx_vor_v(dst0, tmp3); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToARGB4444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vandi_b(tmp1, 0xF0); + tmp0 = __lsx_vsrli_b(tmp0, 4); + dst0 = __lsx_vor_v(tmp1, tmp0); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToUV444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, dst0, dst1; + __m128i const_112 = __lsx_vldi(112); + __m128i const_74 = __lsx_vldi(74); + __m128i const_38 = __lsx_vldi(38); + __m128i const_94 = __lsx_vldi(94); + __m128i const_18 = __lsx_vldi(18); + __m128i const_0x8080 = {0x8080808080808080, 0x8080808080808080}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, + 48, src0, src1, src2, src3); + tmp0 = __lsx_vpickev_h(src1, src0); + tmp1 = __lsx_vpickod_h(src1, src0); + tmp2 = __lsx_vpickev_h(src3, src2); + tmp3 = __lsx_vpickod_h(src3, src2); + reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp0, const_112); + reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp2, const_112); + reg2 = __lsx_vmulwod_h_bu(tmp0, const_74); + reg3 = __lsx_vmulwod_h_bu(tmp2, const_74); + reg2 = __lsx_vmaddwev_h_bu(reg2, tmp1, const_38); + reg3 = __lsx_vmaddwev_h_bu(reg3, tmp3, const_38); + reg0 = __lsx_vsub_h(reg0, reg2); + reg1 = __lsx_vsub_h(reg1, reg3); + reg0 = __lsx_vsrai_h(reg0, 8); + reg1 = __lsx_vsrai_h(reg1, 8); + dst0 = __lsx_vpickev_b(reg1, reg0); + + reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp1, const_112); + reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp3, const_112); + reg2 = __lsx_vmulwev_h_bu(tmp0, const_18); + reg3 = __lsx_vmulwev_h_bu(tmp2, const_18); + reg2 = __lsx_vmaddwod_h_bu(reg2, tmp0, const_94); + reg3 = __lsx_vmaddwod_h_bu(reg3, tmp2, const_94); + reg0 = __lsx_vsub_h(reg0, reg2); + reg1 = __lsx_vsub_h(reg1, reg3); + reg0 = __lsx_vsrai_h(reg0, 8); + reg1 = __lsx_vsrai_h(reg1, 8); + dst1 = __lsx_vpickev_b(reg1, reg0); + + __lsx_vst(dst0, dst_u, 0); + __lsx_vst(dst1, dst_v, 0); + dst_u += 16; + dst_v += 16; + src_argb += 64; + } +} + +void ARGBMultiplyRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, dst0, dst1; + __m128i tmp0, tmp1, tmp2, tmp3; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + tmp0 = __lsx_vilvl_b(src0, src0); + tmp1 = __lsx_vilvh_b(src0, src0); + tmp2 = __lsx_vilvl_b(zero, src1); + tmp3 = __lsx_vilvh_b(zero, src1); + dst0 = __lsx_vmuh_hu(tmp0, tmp2); + dst1 = __lsx_vmuh_hu(tmp1, tmp3); + dst0 = __lsx_vpickev_b(dst1, dst0); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAddRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lsx_vsadd_bu(src0, src1); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBSubtractRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lsx_vssub_bu(src0, src1); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAttenuateRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i b, g, r, a, dst0, dst1; + __m128i control = {0x0005000100040000, 0x0007000300060002}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + b = __lsx_vpackev_b(tmp0, tmp0); + r = __lsx_vpackod_b(tmp0, tmp0); + g = __lsx_vpackev_b(tmp1, tmp1); + a = __lsx_vpackod_b(tmp1, tmp1); + reg0 = __lsx_vmulwev_w_hu(b, a); + reg1 = __lsx_vmulwod_w_hu(b, a); + reg2 = __lsx_vmulwev_w_hu(r, a); + reg3 = __lsx_vmulwod_w_hu(r, a); + reg4 = __lsx_vmulwev_w_hu(g, a); + reg5 = __lsx_vmulwod_w_hu(g, a); + reg0 = __lsx_vssrani_h_w(reg1, reg0, 24); + reg2 = __lsx_vssrani_h_w(reg3, reg2, 24); + reg4 = __lsx_vssrani_h_w(reg5, reg4, 24); + reg0 = __lsx_vshuf_h(control, reg0, reg0); + reg2 = __lsx_vshuf_h(control, reg2, reg2); + reg4 = __lsx_vshuf_h(control, reg4, reg4); + tmp0 = __lsx_vpackev_b(reg4, reg0); + tmp1 = __lsx_vpackev_b(a, reg2); + dst0 = __lsx_vilvl_h(tmp1, tmp0); + dst1 = __lsx_vilvh_h(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + dst_argb += 32; + src_argb += 32; + } +} + +void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1, dst0; + __m128i b, g, r; + __m128i zero = __lsx_vldi(0); + __m128i vec_dither = __lsx_vldrepl_w(&dither4, 0); + + vec_dither = __lsx_vilvl_b(zero, vec_dither); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + b = __lsx_vpackev_b(zero, tmp0); + r = __lsx_vpackod_b(zero, tmp0); + g = __lsx_vpackev_b(zero, tmp1); + b = __lsx_vadd_h(b, vec_dither); + g = __lsx_vadd_h(g, vec_dither); + r = __lsx_vadd_h(r, vec_dither); + DUP2_ARG1(__lsx_vclip255_h, b, g, b, g); + r = __lsx_vclip255_h(r); + b = __lsx_vsrai_h(b, 3); + g = __lsx_vsrai_h(g, 2); + r = __lsx_vsrai_h(r, 3); + g = __lsx_vslli_h(g, 5); + r = __lsx_vslli_h(r, 11); + dst0 = __lsx_vor_v(b, g); + dst0 = __lsx_vor_v(dst0, r); + __lsx_vst(dst0, dst_rgb, 0); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBShuffleRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, dst0, dst1; + __m128i shuf = {0x0404040400000000, 0x0C0C0C0C08080808}; + __m128i temp = __lsx_vldrepl_w(shuffler, 0); + + shuf = __lsx_vadd_b(shuf, temp); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + dst0 = __lsx_vshuf_b(src0, src0, shuf); + dst1 = __lsx_vshuf_b(src1, src1, shuf); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBShadeRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + int x; + int len = width / 4; + __m128i src0, dst0, tmp0, tmp1; + __m128i vec_value = __lsx_vreplgr2vr_w(value); + + vec_value = __lsx_vilvl_b(vec_value, vec_value); + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_argb, 0); + tmp0 = __lsx_vilvl_b(src0, src0); + tmp1 = __lsx_vilvh_b(src0, src0); + tmp0 = __lsx_vmuh_hu(tmp0, vec_value); + tmp1 = __lsx_vmuh_hu(tmp1, vec_value); + dst0 = __lsx_vpickod_b(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + src_argb += 16; + dst_argb += 16; + } +} + +void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, reg2, dst0, dst1; + __m128i const_128 = __lsx_vldi(0x480); + __m128i const_150 = __lsx_vldi(0x96); + __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + reg0 = __lsx_vdp2_h_bu(tmp0, const_br); + reg1 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150); + reg2 = __lsx_vadd_h(reg0, reg1); + tmp0 = __lsx_vpackod_b(reg2, reg2); + tmp1 = __lsx_vpackod_b(tmp1, reg2); + dst0 = __lsx_vilvl_h(tmp1, tmp0); + dst1 = __lsx_vilvh_h(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, spb, spg, spr; + __m128i dst0, dst1; + __m128i spb_g = __lsx_vldi(68); + __m128i spg_g = __lsx_vldi(88); + __m128i spr_g = __lsx_vldi(98); + __m128i spb_br = {0x2311231123112311, 0x2311231123112311}; + __m128i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16}; + __m128i spr_br = {0x3218321832183218, 0x3218321832183218}; + __m128i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + DUP2_ARG2(__lsx_vdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg); + spr = __lsx_vdp2_h_bu(tmp0, spr_br); + spb = __lsx_vmaddwev_h_bu(spb, tmp1, spb_g); + spg = __lsx_vmaddwev_h_bu(spg, tmp1, spg_g); + spr = __lsx_vmaddwev_h_bu(spr, tmp1, spr_g); + spb = __lsx_vsrli_h(spb, 7); + spg = __lsx_vsrli_h(spg, 7); + spr = __lsx_vsrli_h(spr, 7); + spg = __lsx_vsat_hu(spg, 7); + spr = __lsx_vsat_hu(spr, 7); + reg0 = __lsx_vpackev_b(spg, spb); + reg1 = __lsx_vshuf_b(tmp1, spr, shuff); + dst0 = __lsx_vilvl_h(reg1, reg0); + dst1 = __lsx_vilvh_h(reg1, reg0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + dst_argb += 32; + } +} + void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { |