diff options
| field | value | date |
|---|---|---|
| author | Hao Chen <chenhao@loongson.cn> | 2022-02-24 13:39:55 +0800 |
| committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2022-03-09 08:52:54 +0000 |
| commit | 91bae707e100c2e834ccd14e41704202877d8680 (patch) | |
| tree | e07c71fcfd5e3eb9389a96345e635738c1faca03 | |
| parent | 42d76a342f9f0775d5f5fd47f7ef1a9ba6444074 (diff) | |
| download | libyuv-91bae707e100c2e834ccd14e41704202877d8680.tar.gz | |
Optimize functions for LASX in row_lasx.cc.
1. Optimize 18 functions in source/row_lasx.cc file.
2. Make small modifications to LSX.
3. Remove some unnecessary content.
Bug: libyuv:912
Change-Id: Ifd1d85366efb9cdb3b99491e30fa450ff1848661
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3507640
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | include/libyuv/cpu_id.h | 2
-rw-r--r-- | include/libyuv/loongson_intrinsics.h | 307
-rw-r--r-- | include/libyuv/row.h | 140
-rw-r--r-- | source/convert.cc | 77
-rw-r--r-- | source/convert_argb.cc | 64
-rw-r--r-- | source/convert_from_argb.cc | 20
-rw-r--r-- | source/cpu_id.cc | 2
-rw-r--r-- | source/planar_functions.cc | 8
-rw-r--r-- | source/row_any.cc | 54
-rw-r--r-- | source/row_lasx.cc | 949
-rw-r--r-- | source/row_lsx.cc | 16
11 files changed, 1466 insertions(+), 173 deletions(-)
diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h index caac7640..fb90c6c7 100644 --- a/include/libyuv/cpu_id.h +++ b/include/libyuv/cpu_id.h @@ -78,8 +78,6 @@ LIBYUV_API int ArmCpuCaps(const char* cpuinfo_name); LIBYUV_API int MipsCpuCaps(const char* cpuinfo_name); -LIBYUV_API -int LoongarchCpuCaps(void); // For testing, allow CPU flags to be disabled. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. diff --git a/include/libyuv/loongson_intrinsics.h b/include/libyuv/loongson_intrinsics.h index d6cb7a06..79b5d0e4 100644 --- a/include/libyuv/loongson_intrinsics.h +++ b/include/libyuv/loongson_intrinsics.h @@ -18,7 +18,7 @@ * Xiwei Gu <guxiwei-hf@loongson.cn> * Lu Wang <wanglu@loongson.cn> * - * This file is a header file for loongarch builtin extention. + * This file is a header file for loongarch builtin extension. * */ @@ -27,12 +27,12 @@ /** * MAJOR version: Macro usage changes. - * MINOR version: Add new functions, or bug fix. + * MINOR version: Add new functions, or bug fixes. * MICRO version: Comment changes or implementation changes. */ #define LSOM_VERSION_MAJOR 1 -#define LSOM_VERSION_MINOR 0 -#define LSOM_VERSION_MICRO 3 +#define LSOM_VERSION_MINOR 1 +#define LSOM_VERSION_MICRO 0 #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \ { \ @@ -79,11 +79,11 @@ * Description : Dot product & addition of byte vector elements * Arguments : Inputs - in_c, in_h, in_l * Outputs - out - * Retrun Type - halfword + * Return Type - halfword * Details : Signed byte elements from in_h are multiplied by * signed byte elements from in_l, and then added adjacent to * each other to get results with the twice size of input. - * Then the results plus to signed half word elements from in_c. + * Then the results plus to signed half-word elements from in_c. 
* Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) * in_c : 1,2,3,4, 1,2,3,4 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 @@ -91,8 +91,7 @@ * out : 23,40,41,26, 23,40,41,26 * ============================================================================= */ -static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, - __m128i in_h, +static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l) { __m128i out; @@ -106,20 +105,19 @@ static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, * Description : Dot product & addition of byte vector elements * Arguments : Inputs - in_c, in_h, in_l * Outputs - out - * Retrun Type - halfword + * Return Type - halfword * Details : Unsigned byte elements from in_h are multiplied by * unsigned byte elements from in_l, and then added adjacent to * each other to get results with the twice size of input. - * The results plus to signed half word elements from in_c. - * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) + * The results plus to signed half-word elements from in_c. 
+ * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l) * in_c : 1,2,3,4, 1,2,3,4 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 * out : 23,40,41,26, 23,40,41,26 * ============================================================================= */ -static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, - __m128i in_h, +static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_l) { __m128i out; @@ -130,12 +128,38 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, /* * ============================================================================= - * Description : Dot product & addition of half word vector elements + * Description : Dot product & addition of byte vector elements * Arguments : Inputs - in_c, in_h, in_l * Outputs - out - * Retrun Type - __m128i - * Details : Signed half word elements from in_h are multiplied by - * signed half word elements from in_l, and then added adjacent to + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get results with the twice size of input. + * The results plus to signed half-word elements from in_c. 
+ * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l) + * in_c : 1,1,1,1, 1,1,1,1 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8 + * out : -4,-24,-60,-112, 6,26,62,114 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l); + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of half-word vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - __m128i + * Details : Signed half-word elements from in_h are multiplied by + * signed half-word elements from in_l, and then added adjacent to * each other to get results with the twice size of input. * Then the results plus to signed word elements from in_c. * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) @@ -145,8 +169,7 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, * out : 23,40,41,26 * ============================================================================= */ -static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, - __m128i in_h, +static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l) { __m128i out; @@ -160,7 +183,7 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, * Description : Dot product of byte vector elements * Arguments : Inputs - in_h, in_l * Outputs - out - * Retrun Type - halfword + * Return Type - halfword * Details : Signed byte elements from in_h are multiplied by * signed byte elements from in_l, and then added adjacent to * each other to get results with the twice size of input. 
@@ -183,7 +206,7 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) { * Description : Dot product of byte vector elements * Arguments : Inputs - in_h, in_l * Outputs - out - * Retrun Type - halfword + * Return Type - halfword * Details : Unsigned byte elements from in_h are multiplied by * unsigned byte elements from in_l, and then added adjacent to * each other to get results with the twice size of input. @@ -206,7 +229,7 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) { * Description : Dot product of byte vector elements * Arguments : Inputs - in_h, in_l * Outputs - out - * Retrun Type - halfword + * Return Type - halfword * Details : Unsigned byte elements from in_h are multiplied by * signed byte elements from in_l, and then added adjacent to * each other to get results with the twice size of input. @@ -229,7 +252,7 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) { * Description : Dot product of byte vector elements * Arguments : Inputs - in_h, in_l * Outputs - out - * Retrun Type - halfword + * Return Type - halfword * Details : Signed byte elements from in_h are multiplied by * signed byte elements from in_l, and then added adjacent to * each other to get results with the twice size of input. @@ -251,7 +274,8 @@ static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) { * ============================================================================= * Description : Clip all halfword elements of input vector between min & max * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? 
(max) : - * (_in)) Arguments : Inputs - _in (input vector) + * (_in)) + * Arguments : Inputs - _in (input vector) * - min (min threshold) * - max (max threshold) * Outputs - out (output vector with clipped elements) @@ -276,7 +300,7 @@ static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) { * Description : Set each element of vector between 0 and 255 * Arguments : Inputs - _in * Outputs - out - * Retrun Type - halfword + * Return Type - halfword * Details : Signed byte elements from _in are clamped between 0 and 255. * Example : out = __lsx_vclip255_h(_in) * _in : -8,255,280,249, -8,255,280,249 @@ -296,7 +320,7 @@ static inline __m128i __lsx_vclip255_h(__m128i _in) { * Description : Set each element of vector between 0 and 255 * Arguments : Inputs - _in * Outputs - out - * Retrun Type - word + * Return Type - word * Details : Signed byte elements from _in are clamped between 0 and 255. * Example : out = __lsx_vclip255_w(_in) * _in : -8,255,280,249 @@ -363,16 +387,18 @@ static inline __m128i __lsx_vclip255_w(__m128i _in) { * Description : Transpose 8x8 block with byte elements in vectors * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, - * _out7 Details : The rows of the matrix become columns, and the columns - * become rows. Example : LSX_TRANSPOSE8x8_B _in0 : 00,01,02,03,04,05,06,07, - * 00,00,00,00,00,00,00,00 _in1 : 10,11,12,13,14,15,16,17, - * 00,00,00,00,00,00,00,00 _in2 : 20,21,22,23,24,25,26,27, - * 00,00,00,00,00,00,00,00 _in3 : 30,31,32,33,34,35,36,37, - * 00,00,00,00,00,00,00,00 _in4 : 40,41,42,43,44,45,46,47, - * 00,00,00,00,00,00,00,00 _in5 : 50,51,52,53,54,55,56,57, - * 00,00,00,00,00,00,00,00 _in6 : 60,61,62,63,64,65,66,67, - * 00,00,00,00,00,00,00,00 _in7 : 70,71,72,73,74,75,76,77, - * 00,00,00,00,00,00,00,00 + * _out7 + * Details : The rows of the matrix become columns, and the columns + * become rows. 
+ * Example : LSX_TRANSPOSE8x8_B + * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00 + * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00 + * _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00 + * _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00 + * _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00 + * _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00 + * _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00 + * _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00 * * _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 * _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 @@ -388,8 +414,8 @@ static inline __m128i __lsx_vclip255_w(__m128i _in) { _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ _out7) \ { \ - __m128i zero = {0}; \ - __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; \ + __m128i zero = { 0 }; \ + __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \ __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ \ _t0 = __lsx_vilvl_b(_in2, _in0); \ @@ -412,7 +438,7 @@ static inline __m128i __lsx_vclip255_w(__m128i _in) { /* * ============================================================================= - * Description : Transpose 8x8 block with half word elements in vectors + * Description : Transpose 8x8 block with half-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 * Details : @@ -467,15 +493,16 @@ static inline __m128i __lsx_vclip255_w(__m128i _in) { * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block) * Return Type - as per RTYPE * Details : The rows of the matrix become columns, and the columns become - * rows. 
Example : LSX_TRANSPOSE8x4_B _in0 : 00,01,02,03,00,00,00,00, - * 00,00,00,00,00,00,00,00 _in1 : 10,11,12,13,00,00,00,00, - * 00,00,00,00,00,00,00,00 _in2 : 20,21,22,23,00,00,00,00, - * 00,00,00,00,00,00,00,00 _in3 : 30,31,32,33,00,00,00,00, - * 00,00,00,00,00,00,00,00 _in4 : 40,41,42,43,00,00,00,00, - * 00,00,00,00,00,00,00,00 _in5 : 50,51,52,53,00,00,00,00, - * 00,00,00,00,00,00,00,00 _in6 : 60,61,62,63,00,00,00,00, - * 00,00,00,00,00,00,00,00 _in7 : 70,71,72,73,00,00,00,00, - * 00,00,00,00,00,00,00,00 + * rows. + * Example : LSX_TRANSPOSE8x4_B + * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00 * * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 @@ -705,7 +732,7 @@ static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) { * Details : Signed byte elements from in_h are multiplied with * signed byte elements from in_l producing a result * twice the size of input i.e. signed halfword. 
- * Then this iniplication results of adjacent odd-even elements + * Then this multiplication results of adjacent odd-even elements * are added to the out vector * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) * ============================================================================= @@ -748,10 +775,10 @@ static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) { * Description : Dot product of word vector elements * Arguments : Inputs - in_h, in_l * Output - out - * Retrun Type - signed double + * Return Type - signed double * Details : Signed word elements from in_h are multiplied with * signed word elements from in_l producing a result - * twice the size of input i.e. signed double word. + * twice the size of input i.e. signed double-word. * Then this multiplied results of adjacent odd-even elements * are added to the out vector. * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) @@ -792,7 +819,7 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) { * Description : Dot product & addition of byte vector elements * Arguments : Inputs - in_h, in_l * Output - out - * Retrun Type - halfword + * Return Type - halfword * Details : Signed byte elements from in_h are multiplied with * signed byte elements from in_l producing a result * twice the size of input i.e. signed halfword. 
@@ -801,8 +828,7 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) { * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) * ============================================================================= */ -static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, - __m256i in_h, +static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, __m256i in_l) { __m256i out; @@ -813,6 +839,52 @@ static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, /* * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied with + * unsigned byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then this multiplied results of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied with + * signed byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then this multiplied results of adjacent odd-even elements + * are added to the in_c vector. 
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= * Description : Dot product of halfword vector elements * Arguments : Inputs - in_c, in_h, in_l * Output - out @@ -829,8 +901,7 @@ static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, * out : 23,40,41,26, 23,40,41,26 * ============================================================================= */ -static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, - __m256i in_h, +static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in_l) { __m256i out; @@ -853,8 +924,7 @@ static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) * ============================================================================= */ -static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, - __m256i in_h, +static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i in_l) { __m256i out; @@ -877,8 +947,7 @@ static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) * ============================================================================= */ -static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, - __m256i in_h, +static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i in_l) { __m256i out; @@ -902,8 +971,7 @@ static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) * ============================================================================= */ -static inline __m256i 
__lasx_xvdp2sub_h_bu(__m256i in_c, - __m256i in_h, +static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i in_l) { __m256i out; @@ -932,8 +1000,7 @@ static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, * out : -7,-3,0,0, 0,-1,0,-1 * ============================================================================= */ -static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, - __m256i in_h, +static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in_l) { __m256i out; @@ -949,10 +1016,10 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, * Arguments : Inputs - in_h, in_l * Output - out * Return Type - signed word - * Details : Signed halfword elements from in_h are iniplied with + * Details : Signed halfword elements from in_h are multiplied with * signed halfword elements from in_l producing a result * four times the size of input i.e. signed doubleword. - * Then this iniplication results of four adjacent elements + * Then this multiplication results of four adjacent elements * are added together and stored to the out vector. 
* Example : out = __lasx_xvdp4_d_h(in_h, in_l) * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1 @@ -1134,8 +1201,7 @@ static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) { * out : 201, 602,1203,2004, -995, -1794,-2793,-3992 * ============================================================================= */ -static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, - __m256i in_h, +static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in_l) { __m256i tmp0, tmp1, out; @@ -1159,8 +1225,7 @@ static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) * ============================================================================= */ -static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, - __m256i in_h, +static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in_l) { __m256i tmp0, tmp1, out; @@ -1221,22 +1286,24 @@ static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) { /* * ============================================================================= - * Description : The low half of the vector elements are expanded and - * added saturately after being doubled. + * Description : The low half of the vector elements are added to the high half + * after being doubled, then saturated. * Arguments : Inputs - in_h, in_l * Output - out - * Details : The in_h vector adds the in_l vector saturately after the lower - * half of the two-fold zero extension (unsigned byte to unsigned - * halfword) and the results are stored to the out vector. + * Details : The in_h vector adds the in_l vector after the lower half of + * the two-fold zero extension (unsigned byte to unsigned + * halfword) and then saturated. The results are stored to the out + * vector. 
* Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l) * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1 * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, - * 0,0,0,1 out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2, + * 0,0,0,1 + * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2, * ============================================================================= */ static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) { __m256i tmp1, out; - __m256i zero = {0}; + __m256i zero = { 0 }; tmp1 = __lasx_xvilvl_b(zero, in_l); out = __lasx_xvsadd_hu(in_h, tmp1); @@ -1308,8 +1375,8 @@ static inline __m256i __lasx_xvclip255_w(__m256i in) { /* * ============================================================================= * Description : Indexed halfword element values are replicated to all - * elements in output vector. If 'indx < 8' use xvsplati_l_*, - * if 'indx >= 8' use xvsplati_h_*. + * elements in output vector. If 'idx < 8' use xvsplati_l_*, + * if 'idx >= 8' use xvsplati_h_*. * Arguments : Inputs - in, idx * Output - out * Details : Idx element value from in vector is replicated to all @@ -1332,8 +1399,8 @@ static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) { /* * ============================================================================= * Description : Indexed halfword element values are replicated to all - * elements in output vector. If 'indx < 8' use xvsplati_l_*, - * if 'indx >= 8' use xvsplati_h_*. + * elements in output vector. If 'idx < 8' use xvsplati_l_*, + * if 'idx >= 8' use xvsplati_h_*. 
* Arguments : Inputs - in, idx * Output - out * Details : Idx element value from in vector is replicated to all @@ -1355,7 +1422,7 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { /* * ============================================================================= - * Description : Transpose 4x4 block with double word elements in vectors + * Description : Transpose 4x4 block with double-word elements in vectors * Arguments : Inputs - _in0, _in1, _in2, _in3 * Outputs - _out0, _out1, _out2, _out3 * Example : LASX_TRANSPOSE4x4_D @@ -1389,10 +1456,16 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { * Description : Transpose 8x8 block with word elements in vectors * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, - * _out7 Example : LASX_TRANSPOSE8x8_W _in0 : 1,2,3,4,5,6,7,8 _in1 : - * 2,2,3,4,5,6,7,8 _in2 : 3,2,3,4,5,6,7,8 _in3 : 4,2,3,4,5,6,7,8 _in4 : - * 5,2,3,4,5,6,7,8 _in5 : 6,2,3,4,5,6,7,8 _in6 : 7,2,3,4,5,6,7,8 _in7 : - * 8,2,3,4,5,6,7,8 + * _out7 + * Example : LASX_TRANSPOSE8x8_W + * _in0 : 1,2,3,4,5,6,7,8 + * _in1 : 2,2,3,4,5,6,7,8 + * _in2 : 3,2,3,4,5,6,7,8 + * _in3 : 4,2,3,4,5,6,7,8 + * _in4 : 5,2,3,4,5,6,7,8 + * _in5 : 6,2,3,4,5,6,7,8 + * _in6 : 7,2,3,4,5,6,7,8 + * _in7 : 8,2,3,4,5,6,7,8 * * _out0 : 1,2,3,4,5,6,7,8 * _out1 : 2,2,2,2,2,2,2,2 @@ -1445,8 +1518,10 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 * (input 16x8 byte block) * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, - * _out7 (output 8x16 byte block) Details : The rows of the matrix become - * columns, and the columns become rows. Example : See LASX_TRANSPOSE16x8_H + * _out7 (output 8x16 byte block) + * Details : The rows of the matrix become columns, and the columns become + * rows. 
+ * Example : See LASX_TRANSPOSE16x8_H * ============================================================================= */ #define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ @@ -1498,13 +1573,20 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 * (input 16x8 byte block) * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, - * _out7 (output 8x16 byte block) Details : The rows of the matrix become - * columns, and the columns become rows. Example : LASX_TRANSPOSE16x8_H _in0 - * : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 _in2 - * : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 _in4 - * : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 _in6 - * : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 _in8 - * : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _out7 (output 8x16 byte block) + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : LASX_TRANSPOSE16x8_H + * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 * _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 * _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 * _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 @@ -1597,7 +1679,8 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { * Outputs - _out0, _out1, _out2, _out3 * Return Type - signed halfword * Details : The rows of the matrix become columns, and the columns become - * rows. Example : See LASX_TRANSPOSE8x8_H + * rows. 
+ * Example : See LASX_TRANSPOSE8x8_H * ============================================================================= */ #define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ @@ -1619,7 +1702,8 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 * (input 8x8 byte block) * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, - * _out7 (output 8x8 byte block) Example : See LASX_TRANSPOSE8x8_H + * _out7 (output 8x8 byte block) + * Example : See LASX_TRANSPOSE8x8_H * ============================================================================= */ #define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ @@ -1652,11 +1736,16 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { * Arguments : Inputs - _in0, _in1, ~ * Outputs - _out0, _out1, ~ * Details : The rows of the matrix become columns, and the columns become - * rows. Example : LASX_TRANSPOSE8x8_H _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, - * 5,6,7,8 _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 _in2 : 8,2,3,4, 5,6,7,8, - * 8,2,3,4, 5,6,7,8 _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 _in4 : 9,2,3,4, - * 5,6,7,8, 9,2,3,4, 5,6,7,8 _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 _in6 : - * 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 + * rows. 
+ * Example : LASX_TRANSPOSE8x8_H + * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 + * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 + * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 + * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 * * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9 * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 @@ -1832,14 +1921,12 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { * VP:1,2,3,4, * ============================================================================= */ -#define VECT_PRINT(RTYPE, element_num, in0, enter) \ - { \ - RTYPE _tmp0 = (RTYPE)in0; \ - int _i = 0; \ - if (enter) \ - printf("\nVP:"); \ - for (_i = 0; _i < element_num; _i++) \ - printf("%d,", _tmp0[_i]); \ +#define VECT_PRINT(RTYPE, element_num, in0, enter) \ + { \ + RTYPE _tmp0 = (RTYPE)in0; \ + int _i = 0; \ + if (enter) printf("\nVP:"); \ + for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \ } #endif /* LOONGSON_INTRINSICS_H */ diff --git a/include/libyuv/row.h b/include/libyuv/row.h index dc9a35b4..d611b8b3 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -668,9 +668,6 @@ extern "C" { #define HAS_SPLITUVROW_LSX #define HAS_SETROW_LSX #define HAS_MIRRORSPLITUVROW_LSX -#define HAS_SOBELXROW_LSX -#define HAS_SOBELYROW_LSX -#define HAS_HALFFLOATROW_LSX #endif #if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx) @@ -709,6 +706,24 @@ extern "C" { #define HAS_ARGBSHADEROW_LASX #define HAS_ARGBGRAYROW_LASX #define HAS_ARGBSEPIAROW_LASX +#define HAS_ARGB4444TOARGBROW_LASX +#define HAS_ARGB1555TOARGBROW_LASX +#define HAS_RGB565TOARGBROW_LASX +#define HAS_RGB24TOARGBROW_LASX +#define HAS_RAWTOARGBROW_LASX +#define HAS_ARGB1555TOYROW_LASX +#define HAS_ARGB1555TOUVROW_LASX +#define HAS_RGB565TOYROW_LASX +#define HAS_RGB565TOUVROW_LASX +#define HAS_RGB24TOYROW_LASX 
+#define HAS_RGB24TOUVROW_LASX +#define HAS_RAWTOYROW_LASX +#define HAS_RAWTOUVROW_LASX +#define HAS_NV12TOARGBROW_LASX +#define HAS_NV12TORGB565ROW_LASX +#define HAS_NV21TOARGBROW_LASX +#define HAS_ARGBTOYJROW_LASX +#define HAS_ARGBTOUVJROW_LASX #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -1090,16 +1105,31 @@ void NV12ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void NV12ToARGBRow_LASX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void NV12ToRGB565Row_LSX(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); +void NV12ToRGB565Row_LASX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); void NV21ToARGBRow_LSX(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void NV21ToARGBRow_LASX(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void YUY2ToARGBRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, @@ -1136,6 +1166,7 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYJRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1278,26 +1309,51 @@ void ARGBToUVJRow_LSX(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVJRow_LASX(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + 
uint8_t* dst_v, + int width); void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGB565ToUVRow_LSX(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width); +void RGB565ToUVRow_LASX(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGB24ToUVRow_LSX(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width); +void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RAWToUVRow_LSX(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width); +void RAWToUVRow_LASX(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); @@ -1324,9 +1380,13 @@ void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width); void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int 
width); +void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width); void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); @@ -1390,11 +1450,19 @@ void RGBAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void RGB565ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB1555ToYRow_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, @@ -1618,26 +1686,51 @@ void ARGBToUVJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVJRow_Any_LASX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGB1555ToUVRow_Any_LSX(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGB1555ToUVRow_Any_LASX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGB565ToUVRow_Any_LSX(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void RGB565ToUVRow_Any_LASX(const 
uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGB24ToUVRow_Any_LSX(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void RGB24ToUVRow_Any_LASX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RAWToUVRow_Any_LSX(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void RAWToUVRow_Any_LASX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ -2655,10 +2748,12 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, int width); void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); @@ -2671,6 +2766,9 @@ void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, void RGB565ToARGBRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); +void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width); @@ -2680,6 +2778,9 @@ void ARGB1555ToARGBRow_MSA(const uint8_t* 
src_argb1555, void ARGB1555ToARGBRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_argb, int width); +void ARGB1555ToARGBRow_LASX(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width); @@ -2689,6 +2790,9 @@ void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444, uint8_t* dst_argb, int width); +void ARGB4444ToARGBRow_LASX(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width); @@ -2746,10 +2850,14 @@ void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr, void RGB24ToARGBRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToARGBRow_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGBARow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToARGBRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -2764,6 +2872,9 @@ void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr, void RGB565ToARGBRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToARGBRow_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -2776,6 +2887,9 @@ void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr, void ARGB1555ToARGBRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); 
+void ARGB1555ToARGBRow_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -2783,6 +2897,9 @@ void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr, void ARGB4444ToARGBRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB4444ToARGBRow_Any_LASX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); @@ -4419,16 +4536,31 @@ void NV12ToARGBRow_Any_LSX(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void NV12ToARGBRow_Any_LASX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void NV12ToRGB565Row_Any_LSX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void NV12ToRGB565Row_Any_LASX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void NV21ToARGBRow_Any_LSX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void NV21ToARGBRow_Any_LASX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void YUY2ToARGBRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, diff --git a/source/convert.cc b/source/convert.cc index 875afb30..8f02636d 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1883,6 +1883,16 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYROW_LASX) && defined(HAS_RGB24TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToUVRow = RGB24ToUVRow_Any_LASX; + RGB24ToYRow = RGB24ToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToYRow = 
RGB24ToYRow_LASX; + RGB24ToUVRow = RGB24ToUVRow_LASX; + } + } +#endif // Other platforms do intermediate conversion from RGB24 to ARGB. #else // HAS_RGB24TOYROW @@ -2205,6 +2215,16 @@ int RAWToI420(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYROW_LASX) && defined(HAS_RAWTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToUVRow = RAWToUVRow_Any_LASX; + RAWToYRow = RAWToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYRow = RAWToYRow_LASX; + RAWToUVRow = RAWToUVRow_LASX; + } + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYROW @@ -2463,7 +2483,7 @@ int RGB565ToI420(const uint8_t* src_rgb565, int height) { int y; #if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_LSX)) + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) = RGB565ToUVRow_C; @@ -2501,7 +2521,8 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } // MSA version does direct RGB565 to YUV. -#elif (defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_LSX)) +#elif (defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_LSX) \ + || defined(HAS_RGB565TOYROW_LASX)) #if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB565ToUVRow = RGB565ToUVRow_Any_MSA; @@ -2522,6 +2543,16 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } #endif +#if defined(HAS_RGB565TOYROW_LASX) && defined(HAS_RGB565TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB565ToUVRow = RGB565ToUVRow_Any_LASX; + RGB565ToYRow = RGB565ToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB565ToYRow = RGB565ToYRow_LASX; + RGB565ToUVRow = RGB565ToUVRow_LASX; + } + } +#endif // Other platforms do intermediate conversion from RGB565 to ARGB. 
#else #if defined(HAS_RGB565TOARGBROW_SSE2) @@ -2575,14 +2606,14 @@ int RGB565ToI420(const uint8_t* src_rgb565, #endif { #if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_LSX)) + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { #if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_LSX)) + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); @@ -2600,7 +2631,7 @@ int RGB565ToI420(const uint8_t* src_rgb565, } if (height & 1) { #if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_LSX)) + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); #else @@ -2610,7 +2641,7 @@ int RGB565ToI420(const uint8_t* src_rgb565, #endif } #if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_LSX)) + defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) free_aligned_buffer_64(row); #endif } @@ -2631,7 +2662,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, int height) { int y; #if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_LSX)) + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGB1555ToUVRow_C; @@ -2670,7 +2701,8 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } } // MSA version does direct ARGB1555 to YUV. 
-#elif (defined(HAS_ARGB1555TOYROW_MSA)) +#elif (defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_LSX) \ + || defined(HAS_ARGB1555TOYROW_LASX)) #if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; @@ -2681,7 +2713,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } } #endif -#elif (defined(HAS_ARGB1555TOYROW_LSX) && defined(HAS_ARGB1555TOUVROW_LSX)) +#if defined(HAS_ARGB1555TOYROW_LSX) && defined(HAS_ARGB1555TOUVROW_LSX) if (TestCpuFlag(kCpuHasLSX)) { ARGB1555ToUVRow = ARGB1555ToUVRow_Any_LSX; ARGB1555ToYRow = ARGB1555ToYRow_Any_LSX; @@ -2690,6 +2722,17 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, ARGB1555ToUVRow = ARGB1555ToUVRow_LSX; } } +#endif +#if defined(HAS_ARGB1555TOYROW_LASX) && defined(HAS_ARGB1555TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_LASX; + ARGB1555ToYRow = ARGB1555ToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGB1555ToYRow = ARGB1555ToYRow_LASX; + ARGB1555ToUVRow = ARGB1555ToUVRow_LASX; + } + } +#endif // Other platforms do intermediate conversion from ARGB1555 to ARGB. #else #if defined(HAS_ARGB1555TOARGBROW_SSE2) @@ -2743,7 +2786,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, #endif { #if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_LSX)) + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); @@ -2751,7 +2794,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, for (y = 0; y < height - 1; y += 2) { #if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_LSX)) + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, @@ -2771,7 +2814,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } if (height & 1) { #if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_LSX)) + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); #else @@ -2781,7 +2824,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, #endif } #if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_LSX)) + defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) free_aligned_buffer_64(row); #endif } @@ -2873,6 +2916,14 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; diff --git a/source/convert_argb.cc b/source/convert_argb.cc index f23b5d11..90f6c947 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -2886,6 +2886,14 @@ int RGB24ToARGB(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX; + if 
(IS_ALIGNED(width, 32)) { + RGB24ToARGBRow = RGB24ToARGBRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); @@ -2953,6 +2961,14 @@ int RAWToARGB(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToARGBRow = RAWToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToARGBRow = RAWToARGBRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { RAWToARGBRow(src_raw, dst_argb, width); @@ -3079,6 +3095,14 @@ int RGB565ToARGB(const uint8_t* src_rgb565, } } #endif +#if defined(HAS_RGB565TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB565ToARGBRow = RGB565ToARGBRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { RGB565ToARGBRow(src_rgb565, dst_argb, width); @@ -3154,6 +3178,14 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555, } } #endif +#if defined(HAS_ARGB1555TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGB1555ToARGBRow(src_argb1555, dst_argb, width); @@ -3229,6 +3261,14 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGB4444ToARGBRow(src_argb4444, dst_argb, width); @@ -3516,6 +3556,14 @@ int NV12ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV12TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + NV12ToARGBRow = NV12ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); @@ -3592,6 
+3640,14 @@ int NV21ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV21TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + NV21ToARGBRow = NV21ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 16)) { + NV21ToARGBRow = NV21ToARGBRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); @@ -4340,6 +4396,14 @@ int NV12ToRGB565Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV12TORGB565ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_LASX; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB565Row = NV12ToRGB565Row_LASX; + } + } +#endif for (y = 0; y < height; ++y) { NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width); diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index d8b9f7bb..89856e25 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -1941,6 +1941,16 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYJRow = ARGBToYJRow_Any_LASX; + ARGBToUVJRow = ARGBToUVJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_LASX; + ARGBToUVJRow = ARGBToUVJRow_LASX; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -2054,6 +2064,16 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYJRow = ARGBToYJRow_Any_LASX; + ARGBToUVJRow = ARGBToUVJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_LASX; + ARGBToUVJRow = ARGBToUVJRow_LASX; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); diff --git a/source/cpu_id.cc b/source/cpu_id.cc index 39744384..9fce8d20 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -196,7 +196,7 @@ LIBYUV_API 
SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { #define LOONGARCH_CFG2_LSX (1 << 6) #define LOONGARCH_CFG2_LASX (1 << 7) -#if defined(__loongarch__) && defined(__linux__) +#if defined(__loongarch__) LIBYUV_API SAFEBUFFERS int LoongarchCpuCaps(void) { int flag = 0x0; uint32_t cfg2 = 0; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index f2f2d695..0c2ae2b1 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -4207,6 +4207,14 @@ static int ARGBSobelize(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYJRow = ARGBToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_LASX; + } + } +#endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { diff --git a/source/row_any.cc b/source/row_any.cc index 75585128..e79d23de 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -698,6 +698,9 @@ ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) #ifdef HAS_NV12TOARGBROW_LSX ANY21C(NV12ToARGBRow_Any_LSX, NV12ToARGBRow_LSX, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV12TOARGBROW_LASX +ANY21C(NV12ToARGBRow_Any_LASX, NV12ToARGBRow_LASX, 1, 1, 2, 4, 15) +#endif #ifdef HAS_NV21TOARGBROW_SSSE3 ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) #endif @@ -713,6 +716,9 @@ ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) #ifdef HAS_NV21TOARGBROW_LSX ANY21C(NV21ToARGBRow_Any_LSX, NV21ToARGBRow_LSX, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV21TOARGBROW_LASX +ANY21C(NV21ToARGBRow_Any_LASX, NV21ToARGBRow_LASX, 1, 1, 2, 4, 15) +#endif #ifdef HAS_NV12TORGB24ROW_NEON ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) #endif @@ -746,6 +752,9 @@ ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) #ifdef HAS_NV12TORGB565ROW_LSX ANY21C(NV12ToRGB565Row_Any_LSX, NV12ToRGB565Row_LSX, 1, 1, 2, 2, 7) #endif +#ifdef HAS_NV12TORGB565ROW_LASX +ANY21C(NV12ToRGB565Row_Any_LASX, 
NV12ToRGB565Row_LASX, 1, 1, 2, 2, 15) +#endif #undef ANY21C // Any 2 planes of 16 bit to 1 with yuvconstants @@ -998,6 +1007,9 @@ ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) #ifdef HAS_ARGBTOYJROW_LSX ANY11(ARGBToYJRow_Any_LSX, ARGBToYJRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYJROW_LASX +ANY11(ARGBToYJRow_Any_LASX, ARGBToYJRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_BGRATOYROW_NEON ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) #endif @@ -1043,6 +1055,9 @@ ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) #ifdef HAS_RGB24TOYROW_LSX ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15) #endif +#ifdef HAS_RGB24TOYROW_LASX +ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31) +#endif #ifdef HAS_RAWTOYROW_NEON ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) #endif @@ -1061,6 +1076,9 @@ ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) #ifdef HAS_RAWTOYROW_LSX ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15) #endif +#ifdef HAS_RAWTOYROW_LASX +ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31) +#endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) #endif @@ -1070,6 +1088,9 @@ ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) #ifdef HAS_RGB565TOYROW_LSX ANY11(RGB565ToYRow_Any_LSX, RGB565ToYRow_LSX, 0, 2, 1, 15) #endif +#ifdef HAS_RGB565TOYROW_LASX +ANY11(RGB565ToYRow_Any_LASX, RGB565ToYRow_LASX, 0, 2, 1, 31) +#endif #ifdef HAS_ARGB1555TOYROW_NEON ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) #endif @@ -1079,6 +1100,9 @@ ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) #ifdef HAS_ARGB1555TOYROW_LSX ANY11(ARGB1555ToYRow_Any_LSX, ARGB1555ToYRow_LSX, 0, 2, 1, 15) #endif +#ifdef HAS_ARGB1555TOYROW_LASX +ANY11(ARGB1555ToYRow_Any_LASX, ARGB1555ToYRow_LASX, 0, 2, 1, 31) +#endif #ifdef HAS_ARGB4444TOYROW_NEON ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) #endif @@ -1121,6 +1145,9 @@ ANY11(RGB24ToARGBRow_Any_MSA, 
RGB24ToARGBRow_MSA, 0, 3, 4, 15) #ifdef HAS_RGB24TOARGBROW_LSX ANY11(RGB24ToARGBRow_Any_LSX, RGB24ToARGBRow_LSX, 0, 3, 4, 15) #endif +#ifdef HAS_RGB24TOARGBROW_LASX +ANY11(RGB24ToARGBRow_Any_LASX, RGB24ToARGBRow_LASX, 0, 3, 4, 31) +#endif #ifdef HAS_RAWTOARGBROW_NEON ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) #endif @@ -1133,6 +1160,9 @@ ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) #ifdef HAS_RAWTOARGBROW_LSX ANY11(RAWToARGBRow_Any_LSX, RAWToARGBRow_LSX, 0, 3, 4, 15) #endif +#ifdef HAS_RAWTOARGBROW_LASX +ANY11(RAWToARGBRow_Any_LASX, RAWToARGBRow_LASX, 0, 3, 4, 31) +#endif #ifdef HAS_RGB565TOARGBROW_NEON ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) #endif @@ -1142,6 +1172,9 @@ ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) #ifdef HAS_RGB565TOARGBROW_LSX ANY11(RGB565ToARGBRow_Any_LSX, RGB565ToARGBRow_LSX, 0, 2, 4, 15) #endif +#ifdef HAS_RGB565TOARGBROW_LASX +ANY11(RGB565ToARGBRow_Any_LASX, RGB565ToARGBRow_LASX, 0, 2, 4, 31) +#endif #ifdef HAS_ARGB1555TOARGBROW_NEON ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) #endif @@ -1151,6 +1184,9 @@ ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) #ifdef HAS_ARGB1555TOARGBROW_LSX ANY11(ARGB1555ToARGBRow_Any_LSX, ARGB1555ToARGBRow_LSX, 0, 2, 4, 15) #endif +#ifdef HAS_ARGB1555TOARGBROW_LASX +ANY11(ARGB1555ToARGBRow_Any_LASX, ARGB1555ToARGBRow_LASX, 0, 2, 4, 31) +#endif #ifdef HAS_ARGB4444TOARGBROW_NEON ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) #endif @@ -1160,6 +1196,9 @@ ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) #ifdef HAS_ARGB4444TOARGBROW_LSX ANY11(ARGB4444ToARGBRow_Any_LSX, ARGB4444ToARGBRow_LSX, 0, 2, 4, 15) #endif +#ifdef HAS_ARGB4444TOARGBROW_LASX +ANY11(ARGB4444ToARGBRow_Any_LASX, ARGB4444ToARGBRow_LASX, 0, 2, 4, 31) +#endif #ifdef HAS_ARGBATTENUATEROW_SSSE3 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) #endif @@ -1936,6 +1975,9 @@ 
ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) #ifdef HAS_ARGBTOUVJROW_LSX ANY12S(ARGBToUVJRow_Any_LSX, ARGBToUVJRow_LSX, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVJROW_LASX +ANY12S(ARGBToUVJRow_Any_LASX, ARGBToUVJRow_LASX, 0, 4, 31) +#endif #ifdef HAS_BGRATOUVROW_NEON ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) #endif @@ -1975,6 +2017,9 @@ ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15) #ifdef HAS_RGB24TOUVROW_LSX ANY12S(RGB24ToUVRow_Any_LSX, RGB24ToUVRow_LSX, 0, 3, 15) #endif +#ifdef HAS_RGB24TOUVROW_LASX +ANY12S(RGB24ToUVRow_Any_LASX, RGB24ToUVRow_LASX, 0, 3, 31) +#endif #ifdef HAS_RAWTOUVROW_NEON ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) #endif @@ -1987,6 +2032,9 @@ ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) #ifdef HAS_RAWTOUVROW_LSX ANY12S(RAWToUVRow_Any_LSX, RAWToUVRow_LSX, 0, 3, 15) #endif +#ifdef HAS_RAWTOUVROW_LASX +ANY12S(RAWToUVRow_Any_LASX, RAWToUVRow_LASX, 0, 3, 31) +#endif #ifdef HAS_RGB565TOUVROW_NEON ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) #endif @@ -1996,6 +2044,9 @@ ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) #ifdef HAS_RGB565TOUVROW_LSX ANY12S(RGB565ToUVRow_Any_LSX, RGB565ToUVRow_LSX, 0, 2, 15) #endif +#ifdef HAS_RGB565TOUVROW_LASX +ANY12S(RGB565ToUVRow_Any_LASX, RGB565ToUVRow_LASX, 0, 2, 31) +#endif #ifdef HAS_ARGB1555TOUVROW_NEON ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) #endif @@ -2005,6 +2056,9 @@ ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) #ifdef HAS_ARGB1555TOUVROW_LSX ANY12S(ARGB1555ToUVRow_Any_LSX, ARGB1555ToUVRow_LSX, 0, 2, 15) #endif +#ifdef HAS_ARGB1555TOUVROW_LASX +ANY12S(ARGB1555ToUVRow_Any_LASX, ARGB1555ToUVRow_LASX, 0, 2, 31) +#endif #ifdef HAS_ARGB4444TOUVROW_NEON ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) #endif diff --git a/source/row_lasx.cc b/source/row_lasx.cc index 0d43714a..12891ff5 100644 --- a/source/row_lasx.cc +++ b/source/row_lasx.cc @@ -23,18 +23,14 @@ extern "C" { 
#define ALPHA_VAL (-1) // Fill YUV -> RGB conversion constants into vectors -#define YUVTORGB_SETUP(yuvconst, ubvr, ugvg, yg, yb) \ - { \ - __m256i ub, vr, ug, vg; \ - \ - ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]); \ - vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]); \ - ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]); \ - vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]); \ - yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]); \ - yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ - ubvr = __lasx_xvilvl_h(ub, vr); \ - ugvg = __lasx_xvilvl_h(ug, vg); \ +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \ + { \ + ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]); \ + vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]); \ + ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]); \ + vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]); \ + yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]); \ + yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ } // Load 32 YUV422 pixel data @@ -182,19 +178,40 @@ extern "C" { // Pack and Store 8 ARGB values. 
#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ { \ - __m256i temp0, temp1; \ + __m256i temp0, temp1, temp2, temp3; \ \ temp0 = __lasx_xvpackev_b(in_g, in_b); \ temp1 = __lasx_xvpackev_b(in_a, in_r); \ - in_a = __lasx_xvilvl_h(temp1, temp0); \ - in_r = __lasx_xvilvh_h(temp1, temp0); \ - temp0 = __lasx_xvpermi_q(in_r, in_a, 0x20); \ - temp1 = __lasx_xvpermi_q(in_r, in_a, 0x31); \ + temp2 = __lasx_xvilvl_h(temp1, temp0); \ + temp3 = __lasx_xvilvh_h(temp1, temp0); \ + temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20); \ + temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31); \ __lasx_xvst(temp0, pdst_argb, 0); \ __lasx_xvst(temp1, pdst_argb, 32); \ pdst_argb += 64; \ } +#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \ + { \ + __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ + _tmp0 = __lasx_xvaddwev_h_bu(_tmpb, _nexb); \ + _tmp1 = __lasx_xvaddwod_h_bu(_tmpb, _nexb); \ + _tmp2 = __lasx_xvaddwev_h_bu(_tmpg, _nexg); \ + _tmp3 = __lasx_xvaddwod_h_bu(_tmpg, _nexg); \ + _reg0 = __lasx_xvaddwev_h_bu(_tmpr, _nexr); \ + _reg1 = __lasx_xvaddwod_h_bu(_tmpr, _nexr); \ + _tmpb = __lasx_xvavgr_hu(_tmp0, _tmp1); \ + _tmpg = __lasx_xvavgr_hu(_tmp2, _tmp3); \ + _tmpr = __lasx_xvavgr_hu(_reg0, _reg1); \ + _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb); \ + _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr); \ + _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg); \ + _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg); \ + _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr); \ + _reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb); \ + } + + void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) { int x; int len = width / 64; @@ -322,12 +339,14 @@ void I422ToARGBRow_LASX(const uint8_t* src_y, int width) { int x; int len = width / 32; - __m256i vec_yb, vec_yg; + __m256i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg; __m256i vec_ubvr, vec_ugvg; __m256i alpha = __lasx_xvldi(0xFF); __m256i const_0x80 = __lasx_xvldi(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, 
vec_yg, vec_yb); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); for (x = 0; x < len; x++) { __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; @@ -350,12 +369,14 @@ void I422ToRGBARow_LASX(const uint8_t* src_y, int width) { int x; int len = width / 32; - __m256i vec_yb, vec_yg; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; __m256i vec_ubvr, vec_ugvg; __m256i alpha = __lasx_xvldi(0xFF); __m256i const_0x80 = __lasx_xvldi(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); for (x = 0; x < len; x++) { __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; @@ -380,12 +401,14 @@ void I422AlphaToARGBRow_LASX(const uint8_t* src_y, int x; int len = width / 32; int res = width & 31; - __m256i vec_yb, vec_yg; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; __m256i vec_ubvr, vec_ugvg; __m256i zero = __lasx_xvldi(0); __m256i const_0x80 = __lasx_xvldi(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); for (x = 0; x < len; x++) { __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h; @@ -420,7 +443,7 @@ void I422ToRGB24Row_LASX(const uint8_t* src_y, int32_t width) { int x; int len = width / 32; - __m256i vec_yb, vec_yg; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; __m256i vec_ubvr, vec_ugvg; __m256i const_0x80 = __lasx_xvldi(0x80); __m256i shuffler0 = {0x0504120302100100, 0x0A18090816070614, @@ -428,7 +451,9 @@ void I422ToRGB24Row_LASX(const uint8_t* src_y, __m256i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B, 
0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B}; - YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); for (x = 0; x < len; x++) { __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; @@ -467,11 +492,13 @@ void I422ToRGB565Row_LASX(const uint8_t* src_y, int width) { int x; int len = width / 32; - __m256i vec_yb, vec_yg; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; __m256i vec_ubvr, vec_ugvg; __m256i const_0x80 = __lasx_xvldi(0x80); - YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); for (x = 0; x < len; x++) { __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; @@ -514,7 +541,7 @@ void I422ToARGB4444Row_LASX(const uint8_t* src_y, int width) { int x; int len = width / 32; - __m256i vec_yb, vec_yg; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; __m256i vec_ubvr, vec_ugvg; __m256i const_0x80 = __lasx_xvldi(0x80); __m256i alpha = {0xF000F000F000F000, 0xF000F000F000F000, 0xF000F000F000F000, @@ -522,7 +549,9 @@ void I422ToARGB4444Row_LASX(const uint8_t* src_y, __m256i mask = {0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0}; - YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); for (x = 0; x < len; x++) { __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; @@ -564,13 +593,15 @@ void I422ToARGB1555Row_LASX(const uint8_t* src_y, int width) { int x; int len = width / 32; - __m256i vec_yb, vec_yg; + __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; __m256i vec_ubvr, 
vec_ugvg; __m256i const_0x80 = __lasx_xvldi(0x80); - __m256i alpha = {0x8000800080008000, 0x8000800080008000, 0x8000800080008000, - 0x8000800080008000}; + __m256i alpha = {0x8000800080008000, 0x8000800080008000, + 0x8000800080008000, 0x8000800080008000}; - YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); for (x = 0; x < len; x++) { __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; @@ -1338,6 +1369,858 @@ void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width) { } } +void ARGB4444ToARGBRow_LASX(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1, reg2, reg3; + __m256i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_argb4444, 0); + src1 = __lasx_xvld(src_argb4444, 32); + DUP4_ARG2(__lasx_xvandi_b, src0, 0x0F, src0, 0xF0, src1, 0x0F, src1, 0xF0, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lasx_xvslli_b, tmp0, 4, tmp2, 4, reg0, reg2); + DUP2_ARG2(__lasx_xvsrli_b, tmp1, 4, tmp3, 4, reg1, reg3); + DUP4_ARG2(__lasx_xvor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lasx_xvilvl_b, tmp1, tmp0, tmp3, tmp2, reg0, reg2); + DUP2_ARG2(__lasx_xvilvh_b, tmp1, tmp0, tmp3, tmp2, reg1, reg3); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg1, reg0, 0x31, reg3, reg2, + 0x20, reg3, reg2, 0x31, dst0, dst1, dst2, dst3); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + __lasx_xvst(dst2, dst_argb, 64); + __lasx_xvst(dst3, dst_argb, 96); + src_argb4444 += 64; + dst_argb += 128; + } +} + +void ARGB1555ToARGBRow_LASX(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa; + 
__m256i reg0, reg1, reg2, reg3; + __m256i dst0, dst1, dst2, dst3; + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_argb1555, 0); + src1 = __lasx_xvld(src_argb1555, 32); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpg = __lasx_xvsrli_b(tmp0, 5); + reg0 = __lasx_xvandi_b(tmp1, 0x03); + reg0 = __lasx_xvslli_b(reg0, 3); + tmpg = __lasx_xvor_v(tmpg, reg0); + reg1 = __lasx_xvandi_b(tmp1, 0x7C); + tmpr = __lasx_xvsrli_b(reg1, 2); + tmpa = __lasx_xvsrli_b(tmp1, 7); + tmpa = __lasx_xvneg_b(tmpa); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvslli_b(tmpg, 3); + reg2 = __lasx_xvslli_b(tmpr, 3); + tmpb = __lasx_xvsrli_b(tmpb, 2); + tmpg = __lasx_xvsrli_b(tmpg, 2); + tmpr = __lasx_xvsrli_b(tmpr, 2); + tmpb = __lasx_xvor_v(reg0, tmpb); + tmpg = __lasx_xvor_v(reg1, tmpg); + tmpr = __lasx_xvor_v(reg2, tmpr); + DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1); + DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, tmpa, tmpr, reg2, reg3); + dst0 = __lasx_xvilvl_h(reg1, reg0); + dst1 = __lasx_xvilvh_h(reg1, reg0); + dst2 = __lasx_xvilvl_h(reg3, reg2); + dst3 = __lasx_xvilvh_h(reg3, reg2); + DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2, + 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3); + __lasx_xvst(reg0, dst_argb, 0); + __lasx_xvst(reg1, dst_argb, 32); + __lasx_xvst(reg2, dst_argb, 64); + __lasx_xvst(reg3, dst_argb, 96); + src_argb1555 += 64; + dst_argb += 128; + } +} + +void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmpb, tmpg, tmpr; + __m256i reg0, reg1, reg2, reg3, dst0, dst1, dst2, dst3; + __m256i alpha = __lasx_xvldi(0xFF); + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_rgb565, 0); + src1 = __lasx_xvld(src_rgb565, 32); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmpb = __lasx_xvandi_b(tmp0, 
0x1F); + tmpr = __lasx_xvandi_b(tmp1, 0xF8); + reg1 = __lasx_xvandi_b(tmp1, 0x07); + reg0 = __lasx_xvsrli_b(tmp0, 5); + reg1 = __lasx_xvslli_b(reg1, 3); + tmpg = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvsrli_b(tmpb, 2); + tmpb = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvslli_b(tmpg, 2); + reg1 = __lasx_xvsrli_b(tmpg, 4); + tmpg = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvsrli_b(tmpr, 5); + tmpr = __lasx_xvor_v(tmpr, reg0); + DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); + dst0 = __lasx_xvilvl_h(reg1, reg0); + dst1 = __lasx_xvilvh_h(reg1, reg0); + DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1); + dst2 = __lasx_xvilvl_h(reg1, reg0); + dst3 = __lasx_xvilvh_h(reg1, reg0); + DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2, + 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3); + __lasx_xvst(reg0, dst_argb, 0); + __lasx_xvst(reg1, dst_argb, 32); + __lasx_xvst(reg2, dst_argb, 64); + __lasx_xvst(reg3, dst_argb, 96); + src_rgb565 += 64; + dst_argb += 128; + } +} + +void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2; + __m256i tmp0, tmp1, tmp2; + __m256i dst0, dst1, dst2, dst3; + __m256i reg0, reg1, reg2, reg3; + __m256i alpha = __lasx_xvldi(0xFF); + __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, + 0x131211100F0E0D0C, 0x1B1A191817161514}; + __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, + 0x1F1E1D1C1B1A1918, 0x0706050403020100}; + __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, + 0x0B0A090807060504, 0x131211100F0E0D0C}; + __m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706, + 0x1005040310020100, 0x100B0A0910080706}; + + for (x = 0; x < len; x++) { + reg0 = __lasx_xvld(src_rgb24, 0); + reg1 = __lasx_xvld(src_rgb24, 32); + reg2 = __lasx_xvld(src_rgb24, 64); + src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); + src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); + 
src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1); + tmp2 = __lasx_xvshuf_b(src1, src2, shuf2); + DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, + tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0, + 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3); + __lasx_xvst(dst0, dst_argb, 0); + __lasx_xvst(dst1, dst_argb, 32); + __lasx_xvst(dst2, dst_argb, 64); + __lasx_xvst(dst3, dst_argb, 96); + src_rgb24 += 96; + dst_argb += 128; + } +} + +void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2; + __m256i tmp0, tmp1, tmp2, reg0, reg1, reg2, reg3; + __m256i dst0, dst1, dst2, dst3; + __m256i alpha = __lasx_xvldi(0xFF); + __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, + 0x131211100F0E0D0C, 0x1B1A191817161514}; + __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, + 0x1F1E1D1C1B1A1918, 0x0706050403020100}; + __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, + 0x0B0A090807060504, 0x131211100F0E0D0C}; + __m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708, + 0x1003040510000102, 0x10090A0B10060708}; + + for (x = 0; x < len; x++) { + reg0 = __lasx_xvld(src_raw, 0); + reg1 = __lasx_xvld(src_raw, 32); + reg2 = __lasx_xvld(src_raw, 64); + src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); + src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); + src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1); + tmp2 = __lasx_xvshuf_b(src1, src2, shuf2); + DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, + tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0, + 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3); + __lasx_xvst(dst0, dst_argb, 0); + 
__lasx_xvst(dst1, dst_argb, 32); + __lasx_xvst(dst2, dst_argb, 64); + __lasx_xvst(dst3, dst_argb, 96); + src_raw += 96; + dst_argb += 128; + } +} + +void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmpb, tmpg, tmpr; + __m256i reg0, reg1, reg2, dst0; + __m256i const_66 = __lasx_xvldi(66); + __m256i const_129 = __lasx_xvldi(129); + __m256i const_25 = __lasx_xvldi(25); + __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, + 0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_argb1555, 0); + src1 = __lasx_xvld(src_argb1555, 32); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpg = __lasx_xvsrli_b(tmp0, 5); + reg0 = __lasx_xvandi_b(tmp1, 0x03); + reg0 = __lasx_xvslli_b(reg0, 3); + tmpg = __lasx_xvor_v(tmpg, reg0); + reg1 = __lasx_xvandi_b(tmp1, 0x7C); + tmpr = __lasx_xvsrli_b(reg1, 2); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvslli_b(tmpg, 3); + reg2 = __lasx_xvslli_b(tmpr, 3); + tmpb = __lasx_xvsrli_b(tmpb, 2); + tmpg = __lasx_xvsrli_b(tmpg, 2); + tmpr = __lasx_xvsrli_b(tmpr, 2); + tmpb = __lasx_xvor_v(reg0, tmpb); + tmpg = __lasx_xvor_v(reg1, tmpg); + tmpr = __lasx_xvor_v(reg2, tmpr); + reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25); + reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25); + reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129); + reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129); + reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66); + reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66); + dst0 = __lasx_xvpackod_b(reg1, reg0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_y, 0); + src_argb1555 += 64; + dst_y += 32; + } +} + +void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int 
len = width / 32; + const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; + __m256i src0, src1, src2, src3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i reg0, reg1, reg2, reg3, dst0; + __m256i const_112 = __lasx_xvldi(0x438); + __m256i const_74 = __lasx_xvldi(0x425); + __m256i const_38 = __lasx_xvldi(0x413); + __m256i const_94 = __lasx_xvldi(0x42F); + __m256i const_18 = __lasx_xvldi(0x409); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, + 0, next_argb1555, 32, src0, src1, src2, src3); + DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2); + DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + nexb = __lasx_xvandi_b(tmp2, 0x1F); + tmpg = __lasx_xvsrli_b(tmp0, 5); + nexg = __lasx_xvsrli_b(tmp2, 5); + reg0 = __lasx_xvandi_b(tmp1, 0x03); + reg2 = __lasx_xvandi_b(tmp3, 0x03); + reg0 = __lasx_xvslli_b(reg0, 3); + reg2 = __lasx_xvslli_b(reg2, 3); + tmpg = __lasx_xvor_v(tmpg, reg0); + nexg = __lasx_xvor_v(nexg, reg2); + reg1 = __lasx_xvandi_b(tmp1, 0x7C); + reg3 = __lasx_xvandi_b(tmp3, 0x7C); + tmpr = __lasx_xvsrli_b(reg1, 2); + nexr = __lasx_xvsrli_b(reg3, 2); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvslli_b(tmpg, 3); + reg2 = __lasx_xvslli_b(tmpr, 3); + tmpb = __lasx_xvsrli_b(tmpb, 2); + tmpg = __lasx_xvsrli_b(tmpg, 2); + tmpr = __lasx_xvsrli_b(tmpr, 2); + tmpb = __lasx_xvor_v(reg0, tmpb); + tmpg = __lasx_xvor_v(reg1, tmpg); + tmpr = __lasx_xvor_v(reg2, tmpr); + reg0 = __lasx_xvslli_b(nexb, 3); + reg1 = __lasx_xvslli_b(nexg, 3); + reg2 = __lasx_xvslli_b(nexr, 3); + nexb = __lasx_xvsrli_b(nexb, 2); + nexg = __lasx_xvsrli_b(nexg, 2); + nexr = __lasx_xvsrli_b(nexr, 2); + nexb = __lasx_xvor_v(reg0, nexb); + nexg = __lasx_xvor_v(reg1, nexg); + nexr = __lasx_xvor_v(reg2, nexr); + 
RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); + reg0 = __lasx_xvpermi_d(reg0, 0xD8); + reg1 = __lasx_xvpermi_d(reg1, 0xD8); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 1); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + src_argb1555 += 64; + next_argb1555 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1; + __m256i tmp0, tmp1, tmpb, tmpg, tmpr; + __m256i reg0, reg1, dst0; + __m256i const_66 = __lasx_xvldi(66); + __m256i const_129 = __lasx_xvldi(129); + __m256i const_25 = __lasx_xvldi(25); + __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, + 0x1080108010801080, 0x1080108010801080}; + + for (x = 0; x < len; x++) { + src0 = __lasx_xvld(src_rgb565, 0); + src1 = __lasx_xvld(src_rgb565, 32); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpr = __lasx_xvandi_b(tmp1, 0xF8); + reg1 = __lasx_xvandi_b(tmp1, 0x07); + reg0 = __lasx_xvsrli_b(tmp0, 5); + reg1 = __lasx_xvslli_b(reg1, 3); + tmpg = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvsrli_b(tmpb, 2); + tmpb = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvslli_b(tmpg, 2); + reg1 = __lasx_xvsrli_b(tmpg, 4); + tmpg = __lasx_xvor_v(reg1, reg0); + reg0 = __lasx_xvsrli_b(tmpr, 5); + tmpr = __lasx_xvor_v(tmpr, reg0); + reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25); + reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25); + reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129); + reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129); + reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66); + reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66); + dst0 = __lasx_xvpackod_b(reg1, reg0); + dst0 = __lasx_xvpermi_d(dst0, 0xD8); + __lasx_xvst(dst0, dst_y, 0); + dst_y += 32; + 
src_rgb565 += 64; + } +} + +void RGB565ToUVRow_LASX(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 32; + const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; + __m256i src0, src1, src2, src3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i reg0, reg1, reg2, reg3, dst0; + __m256i const_112 = __lasx_xvldi(0x438); + __m256i const_74 = __lasx_xvldi(0x425); + __m256i const_38 = __lasx_xvldi(0x413); + __m256i const_94 = __lasx_xvldi(0x42F); + __m256i const_18 = __lasx_xvldi(0x409); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_rgb565, 0, src_rgb565, 32, next_rgb565, 0, + next_rgb565, 32, src0, src1, src2, src3); + DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2); + DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3); + tmpb = __lasx_xvandi_b(tmp0, 0x1F); + tmpr = __lasx_xvandi_b(tmp1, 0xF8); + nexb = __lasx_xvandi_b(tmp2, 0x1F); + nexr = __lasx_xvandi_b(tmp3, 0xF8); + reg1 = __lasx_xvandi_b(tmp1, 0x07); + reg3 = __lasx_xvandi_b(tmp3, 0x07); + reg0 = __lasx_xvsrli_b(tmp0, 5); + reg1 = __lasx_xvslli_b(reg1, 3); + reg2 = __lasx_xvsrli_b(tmp2, 5); + reg3 = __lasx_xvslli_b(reg3, 3); + tmpg = __lasx_xvor_v(reg1, reg0); + nexg = __lasx_xvor_v(reg2, reg3); + reg0 = __lasx_xvslli_b(tmpb, 3); + reg1 = __lasx_xvsrli_b(tmpb, 2); + reg2 = __lasx_xvslli_b(nexb, 3); + reg3 = __lasx_xvsrli_b(nexb, 2); + tmpb = __lasx_xvor_v(reg1, reg0); + nexb = __lasx_xvor_v(reg2, reg3); + reg0 = __lasx_xvslli_b(tmpg, 2); + reg1 = __lasx_xvsrli_b(tmpg, 4); + reg2 = __lasx_xvslli_b(nexg, 2); + reg3 = __lasx_xvsrli_b(nexg, 4); + tmpg = __lasx_xvor_v(reg1, reg0); + nexg = __lasx_xvor_v(reg2, reg3); + reg0 = __lasx_xvsrli_b(tmpr, 5); + reg2 = __lasx_xvsrli_b(nexr, 5); + tmpr = __lasx_xvor_v(tmpr, reg0); + nexr = 
__lasx_xvor_v(nexr, reg2); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); + reg0 = __lasx_xvpermi_d(reg0, 0xD8); + reg1 = __lasx_xvpermi_d(reg1, 0xD8); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 1); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + dst_u += 16; + dst_v += 16; + src_rgb565 += 64; + next_rgb565 += 64; + } +} + +void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1, reg2, dst0; + __m256i const_129 = __lasx_xvldi(129); + __m256i const_br = {0x4219421942194219, 0x4219421942194219, + 0x4219421942194219, 0x4219421942194219}; + __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, + 0x1080108010801080, 0x1080108010801080}; + __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, + 0x0B09080605030200, 0x17151412110F0E0C}; + __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, + 0x0301001E1D1B1A18, 0x0F0D0C0A09070604}; + __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, + 0x000A000700040001, 0x001600130010000D}; + __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, + 0x0002001F001C0019, 0x000E000B00080005}; + + for (x = 0; x < len; x++) { + reg0 = __lasx_xvld(src_rgb24, 0); + reg1 = __lasx_xvld(src_rgb24, 32); + reg2 = __lasx_xvld(src_rgb24, 64); + src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); + src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); + src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); + tmp0 = __lasx_xvshuf_b(src1, src0, shuff0); + tmp1 = __lasx_xvshuf_b(src1, src2, shuff1); + tmp2 = __lasx_xvshuf_b(src1, src0, shuff2); + tmp3 = __lasx_xvshuf_b(src1, src2, shuff3); + reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129); + reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129); + reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); + reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, 
tmp1); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvst(dst0, dst_y, 0); + dst_y += 32; + src_rgb24 += 96; + } +} + +void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24; + int len = width / 32; + __m256i src0, src1, src2, reg0, reg1, reg2; + __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i const_112 = __lasx_xvldi(0x438); + __m256i const_74 = __lasx_xvldi(0x425); + __m256i const_38 = __lasx_xvldi(0x413); + __m256i const_94 = __lasx_xvldi(0x42F); + __m256i const_18 = __lasx_xvldi(0x409); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + __m256i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18, + 0x15120F0C09060300, 0x00000000001E1B18}; + __m256i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908, + 0x0706050403020100, 0x1D1A1714110A0908}; + __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19, + 0x1613100D0A070401, 0x00000000001F1C19}; + __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908, + 0x0706050403020100, 0x1E1B1815120A0908}; + __m256i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A, + 0x1714110E0B080502, 0x0000000000001D1A}; + __m256i shuff1_r = {0x0706050403020100, 0x1F1C191613100908, + 0x0706050403020100, 0x1F1C191613100908}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_rgb24, 0, src_rgb24, 32, src_rgb24, 64, + next_rgb24, 0, reg0, reg1, reg2, tmp0); + DUP2_ARG2(__lasx_xvld, next_rgb24, 32, next_rgb24, 64, tmp1, tmp2); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, + reg1, 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0); + DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, + nexb); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, 
nex0, shuff0_g, tmpg, + nexg); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, + nexr); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, + nexb); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, + nexg); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, + nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 1); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + src_rgb24 += 96; + next_rgb24 += 96; + dst_u += 16; + dst_v += 16; + } +} + +void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1, reg2, dst0; + __m256i const_129 = __lasx_xvldi(129); + __m256i const_br = {0x1942194219421942, 0x1942194219421942, + 0x1942194219421942, 0x1942194219421942}; + __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, + 0x1080108010801080, 0x1080108010801080}; + __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, + 0x0B09080605030200, 0x17151412110F0E0C}; + __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, + 0x0301001E1D1B1A18, 0x0F0D0C0A09070604}; + __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, + 0x000A000700040001, 0x001600130010000D}; + __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, + 0x0002001F001C0019, 0x000E000B00080005}; + + for (x = 0; x < len; x++) { + reg0 = __lasx_xvld(src_raw, 0); + reg1 = __lasx_xvld(src_raw, 32); + reg2 = __lasx_xvld(src_raw, 64); + src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); + src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); + src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); + tmp0 = __lasx_xvshuf_b(src1, src0, shuff0); + tmp1 = __lasx_xvshuf_b(src1, src2, shuff1); + tmp2 = __lasx_xvshuf_b(src1, src0, shuff2); + tmp3 
= __lasx_xvshuf_b(src1, src2, shuff3); + reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129); + reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129); + reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); + reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvst(dst0, dst_y, 0); + dst_y += 32; + src_raw += 96; + } +} + +void RAWToUVRow_LASX(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_raw = src_raw + src_stride_raw; + int len = width / 32; + __m256i src0, src1, src2, reg0, reg1, reg2; + __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i const_112 = __lasx_xvldi(0x438); + __m256i const_74 = __lasx_xvldi(0x425); + __m256i const_38 = __lasx_xvldi(0x413); + __m256i const_94 = __lasx_xvldi(0x42F); + __m256i const_18 = __lasx_xvldi(0x409); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + __m256i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18, + 0x15120F0C09060300, 0x00000000001E1B18}; + __m256i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908, + 0x0706050403020100, 0x1D1A1714110A0908}; + __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19, + 0x1613100D0A070401, 0x00000000001F1C19}; + __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908, + 0x0706050403020100, 0x1E1B1815120A0908}; + __m256i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A, + 0x1714110E0B080502, 0x0000000000001D1A}; + __m256i shuff1_b = {0x0706050403020100, 0x1F1C191613100908, + 0x0706050403020100, 0x1F1C191613100908}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64, + next_raw, 0, reg0, reg1, reg2, tmp0); + DUP2_ARG2(__lasx_xvld, next_raw, 32, next_raw, 64, tmp1, tmp2); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, + reg1, 0x30, tmp1, tmp0, 0x30, src0, src1, src2, 
nex0); + DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, + tmpb, nexb); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, + tmpg, nexg); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, + tmpr, nexr); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, + tmpb, nexb); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, + tmpg, nexg); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, + tmpr, nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); + dst0 = __lasx_xvpickod_b(reg1, reg0); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 1); + __lasx_xvstelm_d(dst0, dst_u, 8, 2); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + src_raw += 96; + next_raw += 96; + dst_u += 16; + dst_v += 16; + } +} + +void NV12ToARGBRow_LASX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_vrub, vec_vgug, vec_y, vec_vu; + __m256i out_b, out_g, out_r; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i alpha = __lasx_xvldi(0xFF); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub); + vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + vec_y = __lasx_xvld(src_y, 0); + vec_vu = __lasx_xvld(src_uv, 0); + vec_vu = __lasx_xvsub_b(vec_vu, const_0x80); + vec_vu = __lasx_vext2xv_h_b(vec_vu); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g, + out_b); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_y += 16; + src_uv += 16; + } +} + +void NV12ToRGB565Row_LASX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct 
YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_vrub, vec_vgug, vec_y, vec_vu; + __m256i out_b, out_g, out_r; + __m256i const_0x80 = __lasx_xvldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub); + vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug); + + for (x = 0; x < len; x++) { + vec_y = __lasx_xvld(src_y, 0); + vec_vu = __lasx_xvld(src_uv, 0); + vec_vu = __lasx_xvsub_b(vec_vu, const_0x80); + vec_vu = __lasx_vext2xv_h_b(vec_vu); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, + out_r, out_g, out_b); + out_b = __lasx_xvsrli_h(out_b, 3); + out_g = __lasx_xvsrli_h(out_g, 2); + out_r = __lasx_xvsrli_h(out_r, 3); + out_g = __lasx_xvslli_h(out_g, 5); + out_r = __lasx_xvslli_h(out_r, 11); + out_r = __lasx_xvor_v(out_r, out_g); + out_r = __lasx_xvor_v(out_r, out_b); + __lasx_xvst(out_r, dst_rgb565, 0); + src_y += 16; + src_uv += 16; + dst_rgb565 += 32; + } +} + +void NV21ToARGBRow_LASX(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg; + __m256i vec_ubvr, vec_ugvg, vec_y, vec_uv; + __m256i out_b, out_g, out_r; + __m256i const_0x80 = __lasx_xvldi(0x80); + __m256i alpha = __lasx_xvldi(0xFF); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); + vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); + vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + vec_y = __lasx_xvld(src_y, 0); + vec_uv = __lasx_xvld(src_uv, 0); + vec_uv = __lasx_xvsub_b(vec_uv, const_0x80); + vec_uv = __lasx_vext2xv_h_b(vec_uv); + YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b, + out_g, out_r); + STOREARGB(alpha, out_r, out_g, out_b, dst_argb); + src_y += 16; + src_uv += 16; + } +} + 
+void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { + int x; + int len = width / 32; + __m256i src0, src1, src2, src3, dst0; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1; + __m256i const_128 = __lasx_xvldi(0x480); + __m256i const_150 = __lasx_xvldi(0x96); + __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D, + 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; + __m256i shuff = {0x0000000400000000, 0x0000000500000001, + 0x0000000600000002, 0x0000000700000003}; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, + 96, src0, src1, src2, src3); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmp2 = __lasx_xvpickev_b(src3, src2); + tmp3 = __lasx_xvpickod_b(src3, src2); + reg0 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150); + reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp3, const_150); + reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); + reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp2); + dst0 = __lasx_xvpickod_b(reg1, reg0); + dst0 = __lasx_xvperm_w(dst0, shuff); + __lasx_xvst(dst0, dst_y, 0); + dst_y += 32; + src_argb += 128; + } +} + +void ARGBToUVJRow_LASX(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* next_argb = src_argb + src_stride_argb; + int len = width / 32; + __m256i src0, src1, src2, src3; + __m256i nex0, nex1, nex2, nex3; + __m256i tmp0, tmp1, tmp2, tmp3; + __m256i reg0, reg1, dst0; + __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr; + __m256i const_63 = __lasx_xvldi(0x43F); + __m256i const_42 = __lasx_xvldi(0x42A); + __m256i const_21 = __lasx_xvldi(0x415); + __m256i const_53 = __lasx_xvldi(0x435); + __m256i const_10 = __lasx_xvldi(0x40A); + __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, + 0x8080808080808080, 0x8080808080808080}; + __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, + 0x1715070513110301, 0x1F1D0F0D1B190B09}; + + for (x = 0; 
x < len; x++) { + DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, + 96, src0, src1, src2, src3); + DUP4_ARG2(__lasx_xvld, next_argb, 0, next_argb, 32, next_argb, 64, + next_argb, 96, nex0, nex1, nex2, nex3); + tmp0 = __lasx_xvpickev_b(src1, src0); + tmp1 = __lasx_xvpickod_b(src1, src0); + tmp2 = __lasx_xvpickev_b(src3, src2); + tmp3 = __lasx_xvpickod_b(src3, src2); + tmpr = __lasx_xvpickod_b(tmp2, tmp0); + tmpb = __lasx_xvpickev_b(tmp2, tmp0); + tmpg = __lasx_xvpickev_b(tmp3, tmp1); + tmp0 = __lasx_xvpickev_b(nex1, nex0); + tmp1 = __lasx_xvpickod_b(nex1, nex0); + tmp2 = __lasx_xvpickev_b(nex3, nex2); + tmp3 = __lasx_xvpickod_b(nex3, nex2); + nexr = __lasx_xvpickod_b(tmp2, tmp0); + nexb = __lasx_xvpickev_b(tmp2, tmp0); + nexg = __lasx_xvpickev_b(tmp3, tmp1); + tmp0 = __lasx_xvaddwev_h_bu(tmpb, nexb); + tmp1 = __lasx_xvaddwod_h_bu(tmpb, nexb); + tmp2 = __lasx_xvaddwev_h_bu(tmpg, nexg); + tmp3 = __lasx_xvaddwod_h_bu(tmpg, nexg); + reg0 = __lasx_xvaddwev_h_bu(tmpr, nexr); + reg1 = __lasx_xvaddwod_h_bu(tmpr, nexr); + tmpb = __lasx_xvavgr_hu(tmp0, tmp1); + tmpg = __lasx_xvavgr_hu(tmp2, tmp3); + tmpr = __lasx_xvavgr_hu(reg0, reg1); + reg0 = __lasx_xvmadd_h(const_8080, const_63, tmpb); + reg1 = __lasx_xvmadd_h(const_8080, const_63, tmpr); + reg0 = __lasx_xvmsub_h(reg0, const_42, tmpg); + reg1 = __lasx_xvmsub_h(reg1, const_53, tmpg); + reg0 = __lasx_xvmsub_h(reg0, const_21, tmpr); + reg1 = __lasx_xvmsub_h(reg1, const_10, tmpb); + dst0 = __lasx_xvpackod_b(reg1, reg0); + tmp0 = __lasx_xvpermi_d(dst0, 0x44); + tmp1 = __lasx_xvpermi_d(dst0, 0xEE); + dst0 = __lasx_xvshuf_b(tmp1, tmp0, shuff); + __lasx_xvstelm_d(dst0, dst_u, 0, 0); + __lasx_xvstelm_d(dst0, dst_v, 0, 2); + __lasx_xvstelm_d(dst0, dst_u, 8, 1); + __lasx_xvstelm_d(dst0, dst_v, 8, 3); + dst_u += 16; + dst_v += 16; + src_argb += 128; + next_argb += 128; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_lsx.cc b/source/row_lsx.cc index a445e636..3e8b901a 
100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -152,7 +152,7 @@ extern "C" { _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \ _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \ _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \ - _dst0 = __lsx_vsrlni_b_h(_reg1, _reg0, 8); \ + _dst0 = __lsx_vpickod_b(_reg1, _reg0); \ } void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444, @@ -355,7 +355,6 @@ void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, __m128i const_129 = __lsx_vldi(129); __m128i const_25 = __lsx_vldi(25); __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - __m128i shuff = {0x0B030A0209010800, 0x0F070E060D050C04}; for (x = 0; x < len; x++) { src0 = __lsx_vld(src_argb1555, 0); @@ -384,8 +383,7 @@ void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129); reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66); reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - dst0 = __lsx_vshuf_b(dst0, dst0, shuff); + dst0 = __lsx_vpackod_b(reg1, reg0); __lsx_vst(dst0, dst_y, 0); dst_y += 16; src_argb1555 += 32; @@ -468,7 +466,6 @@ void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { __m128i const_129 = __lsx_vldi(129); __m128i const_25 = __lsx_vldi(25); __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - __m128i shuff = {0x0B030A0209010800, 0x0F070E060D050C04}; for (x = 0; x < len; x++) { src0 = __lsx_vld(src_rgb565, 0); @@ -495,8 +492,7 @@ void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129); reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66); reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - dst0 = __lsx_vshuf_b(dst0, dst0, shuff); + dst0 = __lsx_vpackod_b(reg1, reg0); __lsx_vst(dst0, dst_y, 0); dst_y += 16; src_rgb565 += 32; @@ -591,7 +587,7 @@ void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) 
{ reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); + dst0 = __lsx_vpickod_b(reg1, reg0); __lsx_vst(dst0, dst_y, 0); dst_y += 16; src_rgb24 += 48; @@ -939,7 +935,7 @@ void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) { reg1 = __lsx_vmaddwev_h_bu(const_128, tmp3, const_150); reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); + dst0 = __lsx_vpickod_b(reg1, reg0); __lsx_vst(dst0, dst_y, 0); dst_y += 16; src_argb += 64; @@ -1228,7 +1224,7 @@ void ARGBToUVJRow_LSX(const uint8_t* src_argb, reg1 = __lsx_vmsub_h(reg1, const_53, tmpg); reg0 = __lsx_vmsub_h(reg0, const_21, tmpr); reg1 = __lsx_vmsub_h(reg1, const_10, tmpb); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); + dst0 = __lsx_vpickod_b(reg1, reg0); __lsx_vstelm_d(dst0, dst_u, 0, 0); __lsx_vstelm_d(dst0, dst_v, 0, 1); dst_u += 8; |