| author | Frank Barchard <fbarchard@google.com> | 2022-03-17 15:50:29 -0700 |
|---|---|---|
| committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2022-03-18 07:22:36 +0000 |
| commit | 95b14b24462d67aede96e30243694732f9471e63 (patch) | |
| tree | 0cbcd60504d41cf6d588ca59d59fe6ee31d7901d | |
| parent | 3aebf69d668177e7ee6dbbe0025e5c3dbb525ff2 (diff) | |
| download | libyuv-95b14b24462d67aede96e30243694732f9471e63.tar.gz | |
RAWToJ400 faster version for ARM
- Unrolled to 16 pixels
- Take the coefficients via a structure, allowing different colorspaces and channel orders to share one row function
- Use ADDHN to add the 16.5 bias and take the upper 8 bits of the 16-bit sums, narrowing to 8 bits in a single instruction (see the C sketch after this list)
- clang-format applied, which also reflows the MIPS code
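The heart of the change is the shared coefficient structure plus the ADDHN add-and-narrow. ADDHN (vaddhn.u16 on AArch32, addhn on AArch64) adds two 16-bit vectors and keeps only the high byte of each lane, so the bias add and the >>8 narrowing collapse into one instruction; the low byte of the bias supplies the +0.5 rounding (0x80), and for BT.601 the bias also folds in the +16 offset (0x1080 = 16.5 in 8.8 fixed point). Below is a scalar C sketch of what one row function computes: the struct layout and constant values are copied from the diff, but the loop is only a model of the NEON UMULL/UMLAL/ADDHN sequence, not the shipped implementation.

```c
#include <stdint.h>

// Layout and values taken from row_neon.cc / row_neon64.cc in this diff.
struct RgbConstants {
  uint8_t kRGBToY[4];  // weights for bytes 0..2 of each pixel, 0.8 fixed point
  uint16_t kAddY;      // bias in 8.8 fixed point: 0x1080 = 16.5, 0x80 = 0.5
  uint16_t pad;
};

// RAW (R,G,B byte order) to JPEG-range Y, as used by RAWToJ400.
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
// RGB24 (B,G,R byte order) to BT.601 Y, with the 16.5 bias folded in.
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
                                                        0x1080, 0};

// Scalar model of the NEON kernel (a sketch for illustration only).
static void RGBToYRow_Model(const uint8_t* src_rgb,  // 3 bytes per pixel
                            uint8_t* dst_y,
                            int width,
                            const struct RgbConstants* c) {
  int i;
  for (i = 0; i < width; ++i) {
    // UMULL/UMLAL: widening multiply-accumulate into a 16-bit sum.
    // Max sum is 220*255 + 0x1080 < 65536, so 16 bits never overflow.
    uint16_t y16 = (uint16_t)(src_rgb[0] * c->kRGBToY[0] +
                              src_rgb[1] * c->kRGBToY[1] +
                              src_rgb[2] * c->kRGBToY[2]);
    // ADDHN: add the bias and keep the high byte -- bias + >>8 in one step.
    dst_y[i] = (uint8_t)((uint16_t)(y16 + c->kAddY) >> 8);
    src_rgb += 3;
  }
}
```

With the coefficients hoisted into data, each public entry point in the diff shrinks to a one-line call, e.g. RAWToYJRow_NEON forwards to RGBToYMatrixRow_NEON(src, dst, width, &kRawJPEGConstants), so one NEON body covers RGB24/RAW and ARGB/ABGR/RGBA/BGRA in both BT.601 and JPEG flavors.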
On Cortex A510:
Was RAWToJ400_Opt (1623 ms)
Now RAWToJ400_Opt (862 ms)
C   RAWToJ400_Opt (1627 ms)
The rewrite is roughly 1.9x faster; the previous NEON version was effectively no faster than the C path on this core.
Bug: b/220171611
Change-Id: I06a9baf9650ebe2802fb6ff6dfbd524e2c06ada0
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3534023
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | README.chromium                      |   4
-rw-r--r-- | include/libyuv/convert_argb.h        |   4
-rw-r--r-- | include/libyuv/loongson_intrinsics.h |  62
-rw-r--r-- | include/libyuv/macros_msa.h          |  54
-rw-r--r-- | include/libyuv/row.h                 |  12
-rw-r--r-- | include/libyuv/version.h             |   2
-rw-r--r-- | source/convert.cc                    |  80
-rw-r--r-- | source/convert_from_argb.cc          |  32
-rw-r--r-- | source/planar_functions.cc           |   2
-rw-r--r-- | source/row_any.cc                    |  20
-rw-r--r-- | source/row_lasx.cc                   | 187
-rw-r--r-- | source/row_neon.cc                   | 349
-rw-r--r-- | source/row_neon64.cc                 | 355
-rw-r--r-- | unit_test/convert_test.cc            |  22
14 files changed, 533 insertions, 652 deletions
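Most of the churn in source/convert.cc and source/convert_from_argb.cc below is the same mechanical change: because the NEON row kernels now consume 16 pixels per iteration, the width gate for the fast non-Any path moves from 8 to 16, and the previously nested gate for the UV row collapses into the outer one. The resulting dispatch shape, quoted from the RAWToJ420 hunk (the Any variant handles the remainder pixels):

```c
if (TestCpuFlag(kCpuHasNEON)) {
  RAWToUVJRow = RAWToUVJRow_Any_NEON;
  RAWToYJRow = RAWToYJRow_Any_NEON;
  if (IS_ALIGNED(width, 16)) {
    RAWToYJRow = RAWToYJRow_NEON;
    RAWToUVJRow = RAWToUVJRow_NEON;
  }
}
```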
diff --git a/README.chromium b/README.chromium index 940a0142..5bcfae87 100644 --- a/README.chromium +++ b/README.chromium @@ -1,8 +1,8 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1813 +Version: 1814 License: BSD License File: LICENSE Description: -libyuv is an open source project that includes YUV conversion and scaling functionality.
\ No newline at end of file +libyuv is an open source project that includes YUV conversion and scaling functionality. diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index 9b0529cd..f66d20ce 100644 --- a/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -14,7 +14,7 @@ #include "libyuv/basic_types.h" #include "libyuv/rotate.h" // For enum RotationMode. -#include "libyuv/scale.h" // For enum FilterMode. +#include "libyuv/scale.h" // For enum FilterMode. #ifdef __cplusplus namespace libyuv { @@ -2117,7 +2117,7 @@ int P210ToARGBMatrixFilter(const uint16_t* src_y, const struct YuvConstants* yuvconstants, int width, int height, - enum FilterMode filter) ; + enum FilterMode filter); // Convert P010 to AR30 with matrix and UV filter mode. LIBYUV_API diff --git a/include/libyuv/loongson_intrinsics.h b/include/libyuv/loongson_intrinsics.h index 79b5d0e4..1d613def 100644 --- a/include/libyuv/loongson_intrinsics.h +++ b/include/libyuv/loongson_intrinsics.h @@ -91,7 +91,8 @@ * out : 23,40,41,26, 23,40,41,26 * ============================================================================= */ -static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, +static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, + __m128i in_h, __m128i in_l) { __m128i out; @@ -117,7 +118,8 @@ static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, * out : 23,40,41,26, 23,40,41,26 * ============================================================================= */ -static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, +static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, + __m128i in_h, __m128i in_l) { __m128i out; @@ -143,7 +145,8 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, * out : -4,-24,-60,-112, 6,26,62,114 * ============================================================================= */ -static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h, +static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, + __m128i in_h, __m128i in_l) { __m128i out; @@ -169,7 +172,8 @@ static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h, * out : 23,40,41,26 * ============================================================================= */ -static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, +static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, + __m128i in_h, __m128i in_l) { __m128i out; @@ -414,8 +418,8 @@ static inline __m128i __lsx_vclip255_w(__m128i _in) { _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ _out7) \ { \ - __m128i zero = { 0 }; \ - __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \ + __m128i zero = {0}; \ + __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; \ __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ \ _t0 = __lsx_vilvl_b(_in2, _in0); \ @@ -828,7 +832,8 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) { * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) * ============================================================================= */ -static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, +static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, + __m256i in_h, __m256i in_l) { __m256i out; @@ -851,7 +856,8 @@ static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) * ============================================================================= */ -static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h, +static 
inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, + __m256i in_h, __m256i in_l) { __m256i out; @@ -874,7 +880,8 @@ static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h, * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) * ============================================================================= */ -static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h, +static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, + __m256i in_h, __m256i in_l) { __m256i out; @@ -901,7 +908,8 @@ static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h, * out : 23,40,41,26, 23,40,41,26 * ============================================================================= */ -static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, +static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, + __m256i in_h, __m256i in_l) { __m256i out; @@ -924,7 +932,8 @@ static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) * ============================================================================= */ -static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, +static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, + __m256i in_h, __m256i in_l) { __m256i out; @@ -947,7 +956,8 @@ static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) * ============================================================================= */ -static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, +static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, + __m256i in_h, __m256i in_l) { __m256i out; @@ -971,7 +981,8 @@ static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) * ============================================================================= */ -static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, +static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, + __m256i in_h, __m256i in_l) { __m256i out; @@ -1000,7 +1011,8 @@ static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, * out : -7,-3,0,0, 0,-1,0,-1 * ============================================================================= */ -static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, +static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, + __m256i in_h, __m256i in_l) { __m256i out; @@ -1201,7 +1213,8 @@ static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) { * out : 201, 602,1203,2004, -995, -1794,-2793,-3992 * ============================================================================= */ -static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, +static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, + __m256i in_h, __m256i in_l) { __m256i tmp0, tmp1, out; @@ -1225,7 +1238,8 @@ static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) * ============================================================================= */ -static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, +static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, + __m256i in_h, __m256i in_l) { __m256i tmp0, tmp1, out; @@ -1303,7 +1317,7 @@ static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) { */ static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) { __m256i tmp1, out; - __m256i zero = { 0 }; + 
__m256i zero = {0}; tmp1 = __lasx_xvilvl_b(zero, in_l); out = __lasx_xvsadd_hu(in_h, tmp1); @@ -1921,12 +1935,14 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { * VP:1,2,3,4, * ============================================================================= */ -#define VECT_PRINT(RTYPE, element_num, in0, enter) \ - { \ - RTYPE _tmp0 = (RTYPE)in0; \ - int _i = 0; \ - if (enter) printf("\nVP:"); \ - for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \ +#define VECT_PRINT(RTYPE, element_num, in0, enter) \ + { \ + RTYPE _tmp0 = (RTYPE)in0; \ + int _i = 0; \ + if (enter) \ + printf("\nVP:"); \ + for (_i = 0; _i < element_num; _i++) \ + printf("%d,", _tmp0[_i]); \ } #endif /* LOONGSON_INTRINSICS_H */ diff --git a/include/libyuv/macros_msa.h b/include/libyuv/macros_msa.h index 7a1dc4af..b9a44fcc 100644 --- a/include/libyuv/macros_msa.h +++ b/include/libyuv/macros_msa.h @@ -81,37 +81,35 @@ }) #endif // !(__mips == 64) #else // !(__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ - uint32_t val_lw_m; \ - \ - __asm__ volatile ( \ - "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ - "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ - \ - : [val_lw_m] "=&r"(val_lw_m) \ - : [psrc_lw_m] "r"(psrc_lw_m) \ - ); \ - \ - val_lw_m; \ +#define LW(psrc) \ + ({ \ + uint8_t* psrc_lw_m = (uint8_t*)(psrc); \ + uint32_t val_lw_m; \ + \ + __asm__ volatile( \ + "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ + "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ + \ + : [val_lw_m] "=&r"(val_lw_m) \ + : [psrc_lw_m] "r"(psrc_lw_m)); \ + \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ - uint64_t val_ld_m = 0; \ - \ - __asm__ volatile ( \ - "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ - "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ - \ - : [val_ld_m] "=&r" (val_ld_m) \ - : [psrc_ld_m] "r" (psrc_ld_m) \ - ); \ - \ - val_ld_m; \ +#define LD(psrc) \ + ({ \ + uint8_t* psrc_ld_m = (uint8_t*)(psrc); \ + uint64_t val_ld_m = 0; \ + \ + __asm__ volatile( \ + "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ + "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ + \ + : [val_ld_m] "=&r"(val_ld_m) \ + : [psrc_ld_m] "r"(psrc_ld_m)); \ + \ + val_ld_m; \ }) #else // !(__mips == 64) #define LD(psrc) \ diff --git a/include/libyuv/row.h b/include/libyuv/row.h index d611b8b3..2be84338 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -1380,7 +1380,9 @@ void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width); void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); -void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width); void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width); @@ -2748,7 +2750,9 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, int width); void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); void 
RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); @@ -2851,8 +2855,8 @@ void RGB24ToARGBRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToARGBRow_Any_LASX(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); + uint8_t* dst_ptr, + int width); void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGBARow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e9ad82d9..d0a46371 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1813 +#define LIBYUV_VERSION 1814 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 8f02636d..45590a7b 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1422,7 +1422,7 @@ int ARGBToI420(const uint8_t* src_argb, #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -1658,7 +1658,7 @@ int ABGRToI420(const uint8_t* src_abgr, #if defined(HAS_ABGRTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ABGRToYRow = ABGRToYRow_NEON; } } @@ -1754,7 +1754,7 @@ int RGBAToI420(const uint8_t* src_rgba, #if defined(HAS_RGBATOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGBAToYRow = RGBAToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RGBAToYRow = RGBAToYRow_NEON; } } @@ -1855,11 +1855,9 @@ int RGB24ToI420(const uint8_t* src_rgb24, if (TestCpuFlag(kCpuHasNEON)) { RGB24ToUVRow = RGB24ToUVRow_Any_NEON; RGB24ToYRow = RGB24ToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RGB24ToYRow = RGB24ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVRow = RGB24ToUVRow_NEON; - } + RGB24ToUVRow = RGB24ToUVRow_NEON; } } #endif @@ -2031,11 +2029,9 @@ int RGB24ToJ420(const uint8_t* src_rgb24, if (TestCpuFlag(kCpuHasNEON)) { RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON; RGB24ToYJRow = RGB24ToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RGB24ToYJRow = RGB24ToYJRow_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVJRow = RGB24ToUVJRow_NEON; - } + RGB24ToUVJRow = RGB24ToUVJRow_NEON; } } #endif @@ -2095,18 +2091,18 @@ int RGB24ToJ420(const uint8_t* src_rgb24, #endif #endif // HAS_RGB24TOYJROW - { +{ #if !defined(HAS_RGB24TOYJROW) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + // Allocate 2 rows of ARGB. 
+ const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); #endif - for (y = 0; y < height - 1; y += 2) { + for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RGB24TOYJROW) - RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); - RGB24ToYJRow(src_rgb24, dst_y, width); - RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); + RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYJRow(src_rgb24, dst_y, width); + RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); #else RGB24ToARGBRow(src_rgb24, row, width); RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); @@ -2114,26 +2110,26 @@ int RGB24ToJ420(const uint8_t* src_rgb24, ARGBToYJRow(row, dst_y, width); ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); #endif - src_rgb24 += src_stride_rgb24 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { #if defined(HAS_RGB24TOYJROW) - RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width); - RGB24ToYJRow(src_rgb24, dst_y, width); + RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYJRow(src_rgb24, dst_y, width); #else RGB24ToARGBRow(src_rgb24, row, width); ARGBToUVJRow(row, 0, dst_u, dst_v, width); ARGBToYJRow(row, dst_y, width); #endif - } + } #if !defined(HAS_RGB24TOYJROW) - free_aligned_buffer_64(row); + free_aligned_buffer_64(row); #endif - } - return 0; +} +return 0; } #undef HAS_RGB24TOYJROW @@ -2187,11 +2183,9 @@ int RAWToI420(const uint8_t* src_raw, if (TestCpuFlag(kCpuHasNEON)) { RAWToUVRow = RAWToUVRow_Any_NEON; RAWToYRow = RAWToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RAWToYRow = RAWToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - RAWToUVRow = RAWToUVRow_NEON; - } + RAWToUVRow = RAWToUVRow_NEON; } } #endif @@ -2363,11 +2357,9 @@ int RAWToJ420(const uint8_t* src_raw, if (TestCpuFlag(kCpuHasNEON)) { RAWToUVJRow = RAWToUVJRow_Any_NEON; RAWToYJRow = RAWToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RAWToYJRow = RAWToYJRow_NEON; - if (IS_ALIGNED(width, 16)) { - RAWToUVJRow = RAWToUVJRow_NEON; - } + RAWToUVJRow = RAWToUVJRow_NEON; } } #endif @@ -2521,8 +2513,8 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } // MSA version does direct RGB565 to YUV. -#elif (defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_LSX) \ - || defined(HAS_RGB565TOYROW_LASX)) +#elif (defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_LSX) || \ + defined(HAS_RGB565TOYROW_LASX)) #if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB565ToUVRow = RGB565ToUVRow_Any_MSA; @@ -2701,8 +2693,8 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } } // MSA version does direct ARGB1555 to YUV. 
-#elif (defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_LSX) \ - || defined(HAS_ARGB1555TOYROW_LASX)) +#elif (defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_LSX) || \ + defined(HAS_ARGB1555TOYROW_LASX)) #if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; @@ -3067,7 +3059,7 @@ int RGB24ToJ400(const uint8_t* src_rgb24, #if defined(HAS_RGB24TOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToYJRow = RGB24ToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RGB24ToYJRow = RGB24ToYJRow_NEON; } } diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 89856e25..e50c2af3 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -103,7 +103,7 @@ int ARGBToI444(const uint8_t* src_argb, #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -205,7 +205,7 @@ int ARGBToI422(const uint8_t* src_argb, #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -283,7 +283,7 @@ int ARGBToNV12(const uint8_t* src_argb, #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -477,7 +477,7 @@ int ARGBToNV21(const uint8_t* src_argb, #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -638,7 +638,7 @@ int ABGRToNV12(const uint8_t* src_abgr, #if defined(HAS_ABGRTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ABGRToYRow = ABGRToYRow_NEON; } } @@ -790,7 +790,7 @@ int ABGRToNV21(const uint8_t* src_abgr, #if defined(HAS_ABGRTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ABGRToYRow = ABGRToYRow_NEON; } } @@ -947,7 +947,7 @@ int ARGBToYUY2(const uint8_t* src_argb, #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -1110,7 +1110,7 @@ int ARGBToUYVY(const uint8_t* src_argb, #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -1249,7 +1249,7 @@ int ARGBToI400(const uint8_t* src_argb, #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_NEON; } } @@ -1906,7 +1906,7 @@ int ARGBToJ420(const uint8_t* src_argb, #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_NEON; } } @@ -2029,7 +2029,7 @@ int ARGBToJ422(const uint8_t* src_argb, #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; - if 
(IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_NEON; } } @@ -2248,7 +2248,7 @@ int ARGBToJ400(const uint8_t* src_argb, #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_NEON; } } @@ -2314,7 +2314,7 @@ int RGBAToJ400(const uint8_t* src_rgba, #if defined(HAS_RGBATOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGBAToYJRow = RGBAToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RGBAToYJRow = RGBAToYJRow_NEON; } } @@ -2387,11 +2387,9 @@ int RAWToJNV21(const uint8_t* src_raw, if (TestCpuFlag(kCpuHasNEON)) { RAWToUVJRow = RAWToUVJRow_Any_NEON; RAWToYJRow = RAWToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { RAWToYJRow = RAWToYJRow_NEON; - if (IS_ALIGNED(width, 16)) { - RAWToUVJRow = RAWToUVJRow_NEON; - } + RAWToUVJRow = RAWToUVJRow_NEON; } } #endif diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 0c2ae2b1..43dc1892 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -4186,7 +4186,7 @@ static int ARGBSobelize(const uint8_t* src_argb, #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_NEON; } } diff --git a/source/row_any.cc b/source/row_any.cc index e79d23de..2d30e0a5 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -987,7 +987,7 @@ ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) #endif #ifdef HAS_ARGBTOYROW_NEON -ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) +ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15) #endif #ifdef HAS_ARGBTOYROW_MSA ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) @@ -996,10 +996,10 @@ ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31) #endif #ifdef HAS_ARGBTOYJROW_NEON -ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) +ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15) #endif #ifdef HAS_RGBATOYJROW_NEON -ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 7) +ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15) #endif #ifdef HAS_ARGBTOYJROW_MSA ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) @@ -1011,7 +1011,7 @@ ANY11(ARGBToYJRow_Any_LSX, ARGBToYJRow_LSX, 0, 4, 1, 15) ANY11(ARGBToYJRow_Any_LASX, ARGBToYJRow_LASX, 0, 4, 1, 31) #endif #ifdef HAS_BGRATOYROW_NEON -ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) +ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15) #endif #ifdef HAS_BGRATOYROW_MSA ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) @@ -1020,7 +1020,7 @@ ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) ANY11(BGRAToYRow_Any_LSX, BGRAToYRow_LSX, 0, 4, 1, 15) #endif #ifdef HAS_ABGRTOYROW_NEON -ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) +ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15) #endif #ifdef HAS_ABGRTOYROW_MSA ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) @@ -1029,7 +1029,7 @@ ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) ANY11(ABGRToYRow_Any_LSX, ABGRToYRow_LSX, 0, 4, 1, 15) #endif #ifdef HAS_RGBATOYROW_NEON -ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) +ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15) #endif #ifdef HAS_RGBATOYROW_MSA ANY11(RGBAToYRow_Any_MSA, 
RGBAToYRow_MSA, 0, 4, 1, 15) @@ -1038,7 +1038,7 @@ ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15) #endif #ifdef HAS_RGB24TOYROW_NEON -ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) +ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYJROW_AVX2 ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31) @@ -1047,7 +1047,7 @@ ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31) ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYJROW_NEON -ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 7) +ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYROW_MSA ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) @@ -1059,7 +1059,7 @@ ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15) ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31) #endif #ifdef HAS_RAWTOYROW_NEON -ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) +ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYJROW_AVX2 ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31) @@ -1068,7 +1068,7 @@ ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31) ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYJROW_NEON -ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 7) +ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYROW_MSA ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) diff --git a/source/row_lasx.cc b/source/row_lasx.cc index 12891ff5..7dd18f40 100644 --- a/source/row_lasx.cc +++ b/source/row_lasx.cc @@ -23,14 +23,14 @@ extern "C" { #define ALPHA_VAL (-1) // Fill YUV -> RGB conversion constants into vectors -#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \ - { \ - ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]); \ - vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]); \ - ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]); \ - vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]); \ - yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]); \ - yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \ + { \ + ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]); \ + vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]); \ + ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]); \ + vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]); \ + yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]); \ + yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ } // Load 32 YUV422 pixel data @@ -191,27 +191,26 @@ extern "C" { pdst_argb += 64; \ } -#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \ - { \ - __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ - _tmp0 = __lasx_xvaddwev_h_bu(_tmpb, _nexb); \ - _tmp1 = __lasx_xvaddwod_h_bu(_tmpb, _nexb); \ - _tmp2 = __lasx_xvaddwev_h_bu(_tmpg, _nexg); \ - _tmp3 = __lasx_xvaddwod_h_bu(_tmpg, _nexg); \ - _reg0 = __lasx_xvaddwev_h_bu(_tmpr, _nexr); \ - _reg1 = __lasx_xvaddwod_h_bu(_tmpr, _nexr); \ - _tmpb = __lasx_xvavgr_hu(_tmp0, _tmp1); \ - _tmpg = __lasx_xvavgr_hu(_tmp2, _tmp3); \ - _tmpr = __lasx_xvavgr_hu(_reg0, _reg1); \ - _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb); \ - _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr); \ - _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg); \ - _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg); \ - _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr); \ - _reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb); \ +#define RGBTOUV(_tmpb, _tmpg, 
_tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \ + { \ + __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ + _tmp0 = __lasx_xvaddwev_h_bu(_tmpb, _nexb); \ + _tmp1 = __lasx_xvaddwod_h_bu(_tmpb, _nexb); \ + _tmp2 = __lasx_xvaddwev_h_bu(_tmpg, _nexg); \ + _tmp3 = __lasx_xvaddwod_h_bu(_tmpg, _nexg); \ + _reg0 = __lasx_xvaddwev_h_bu(_tmpr, _nexr); \ + _reg1 = __lasx_xvaddwod_h_bu(_tmpr, _nexr); \ + _tmpb = __lasx_xvavgr_hu(_tmp0, _tmp1); \ + _tmpg = __lasx_xvavgr_hu(_tmp2, _tmp3); \ + _tmpr = __lasx_xvavgr_hu(_reg0, _reg1); \ + _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb); \ + _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr); \ + _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg); \ + _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg); \ + _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr); \ + _reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb); \ } - void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) { int x; int len = width / 64; @@ -596,8 +595,8 @@ void I422ToARGB1555Row_LASX(const uint8_t* src_y, __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; __m256i vec_ubvr, vec_ugvg; __m256i const_0x80 = __lasx_xvldi(0x80); - __m256i alpha = {0x8000800080008000, 0x8000800080008000, - 0x8000800080008000, 0x8000800080008000}; + __m256i alpha = {0x8000800080008000, 0x8000800080008000, 0x8000800080008000, + 0x8000800080008000}; YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr); @@ -1507,14 +1506,14 @@ void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, __m256i dst0, dst1, dst2, dst3; __m256i reg0, reg1, reg2, reg3; __m256i alpha = __lasx_xvldi(0xFF); - __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, - 0x131211100F0E0D0C, 0x1B1A191817161514}; - __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, - 0x1F1E1D1C1B1A1918, 0x0706050403020100}; - __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, - 0x0B0A090807060504, 0x131211100F0E0D0C}; - __m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706, - 0x1005040310020100, 0x100B0A0910080706}; + __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C, + 0x1B1A191817161514}; + __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918, + 0x0706050403020100}; + __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504, + 0x131211100F0E0D0C}; + __m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706, 0x1005040310020100, + 0x100B0A0910080706}; for (x = 0; x < len; x++) { reg0 = __lasx_xvld(src_rgb24, 0); @@ -1523,7 +1522,8 @@ void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, + tmp1); tmp2 = __lasx_xvshuf_b(src1, src2, shuf2); DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3); @@ -1545,14 +1545,14 @@ void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) { __m256i tmp0, tmp1, tmp2, reg0, reg1, reg2, reg3; __m256i dst0, dst1, dst2, dst3; __m256i alpha = __lasx_xvldi(0xFF); - __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, - 0x131211100F0E0D0C, 0x1B1A191817161514}; - __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, - 0x1F1E1D1C1B1A1918, 0x0706050403020100}; - __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, - 
0x0B0A090807060504, 0x131211100F0E0D0C}; - __m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708, - 0x1003040510000102, 0x10090A0B10060708}; + __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C, + 0x1B1A191817161514}; + __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918, + 0x0706050403020100}; + __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504, + 0x131211100F0E0D0C}; + __m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708, 0x1003040510000102, + 0x10090A0B10060708}; for (x = 0; x < len; x++) { reg0 = __lasx_xvld(src_raw, 0); @@ -1561,7 +1561,8 @@ void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) { src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, + tmp1); tmp2 = __lasx_xvshuf_b(src1, src2, shuf2); DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha, tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3); @@ -1577,8 +1578,8 @@ void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { + uint8_t* dst_y, + int width) { int x; int len = width / 32; __m256i src0, src1; @@ -1646,8 +1647,8 @@ void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555, 0x8080808080808080, 0x8080808080808080}; for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, - 0, next_argb1555, 32, src0, src1, src2, src3); + DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0, + next_argb1555, 32, src0, src1, src2, src3); DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2); DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3); tmpb = __lasx_xvandi_b(tmp0, 0x1F); @@ -1821,14 +1822,14 @@ void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { 0x4219421942194219, 0x4219421942194219}; __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, 0x1080108010801080, 0x1080108010801080}; - __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, - 0x0B09080605030200, 0x17151412110F0E0C}; - __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, - 0x0301001E1D1B1A18, 0x0F0D0C0A09070604}; - __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, - 0x000A000700040001, 0x001600130010000D}; - __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, - 0x0002001F001C0019, 0x000E000B00080005}; + __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200, + 0x17151412110F0E0C}; + __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18, + 0x0F0D0C0A09070604}; + __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001, + 0x001600130010000D}; + __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019, + 0x000E000B00080005}; for (x = 0; x < len; x++) { reg0 = __lasx_xvld(src_rgb24, 0); @@ -1887,8 +1888,8 @@ void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, DUP4_ARG2(__lasx_xvld, src_rgb24, 0, src_rgb24, 32, src_rgb24, 64, next_rgb24, 0, reg0, reg1, reg2, tmp0); DUP2_ARG2(__lasx_xvld, next_rgb24, 32, next_rgb24, 64, tmp1, tmp2); - DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, - reg1, 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0); + DUP4_ARG3(__lasx_xvpermi_q, reg1, 
reg0, 0x30, reg2, reg0, 0x21, reg2, reg1, + 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0); DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2); DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, nexb); @@ -1926,14 +1927,14 @@ void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { 0x1942194219421942, 0x1942194219421942}; __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, 0x1080108010801080, 0x1080108010801080}; - __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, - 0x0B09080605030200, 0x17151412110F0E0C}; - __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, - 0x0301001E1D1B1A18, 0x0F0D0C0A09070604}; - __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, - 0x000A000700040001, 0x001600130010000D}; - __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, - 0x0002001F001C0019, 0x000E000B00080005}; + __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200, + 0x17151412110F0E0C}; + __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18, + 0x0F0D0C0A09070604}; + __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001, + 0x001600130010000D}; + __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019, + 0x000E000B00080005}; for (x = 0; x < len; x++) { reg0 = __lasx_xvld(src_raw, 0); @@ -1989,24 +1990,24 @@ void RAWToUVRow_LASX(const uint8_t* src_raw, 0x0706050403020100, 0x1F1C191613100908}; for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64, - next_raw, 0, reg0, reg1, reg2, tmp0); + DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64, next_raw, 0, + reg0, reg1, reg2, tmp0); DUP2_ARG2(__lasx_xvld, next_raw, 32, next_raw, 64, tmp1, tmp2); - DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, - reg1, 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0); + DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1, + 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0); DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, - tmpb, nexb); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, - tmpg, nexg); - DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, - tmpr, nexr); - DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, - tmpb, nexb); - DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, - tmpg, nexg); - DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, - tmpr, nexr); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb, + nexb); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg, + nexg); + DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr, + nexr); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb, + nexb); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg, + nexg); + DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr, + nexr); RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1); dst0 = __lasx_xvpickod_b(reg1, reg0); __lasx_xvstelm_d(dst0, dst_u, 0, 0); @@ -2071,8 +2072,8 @@ void NV12ToRGB565Row_LASX(const uint8_t* src_y, vec_vu = __lasx_xvld(src_uv, 0); vec_vu = __lasx_xvsub_b(vec_vu, const_0x80); vec_vu = __lasx_vext2xv_h_b(vec_vu); - YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, 
vec_yb, - out_r, out_g, out_b); + YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g, + out_b); out_b = __lasx_xvsrli_h(out_b, 3); out_g = __lasx_xvsrli_h(out_g, 2); out_r = __lasx_xvsrli_h(out_r, 3); @@ -2109,8 +2110,8 @@ void NV21ToARGBRow_LASX(const uint8_t* src_y, vec_uv = __lasx_xvld(src_uv, 0); vec_uv = __lasx_xvsub_b(vec_uv, const_0x80); vec_uv = __lasx_vext2xv_h_b(vec_uv); - YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b, - out_g, out_r); + YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b, out_g, + out_r); STOREARGB(alpha, out_r, out_g, out_b, dst_argb); src_y += 16; src_uv += 16; @@ -2127,8 +2128,8 @@ void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { __m256i const_150 = __lasx_xvldi(0x96); __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; - __m256i shuff = {0x0000000400000000, 0x0000000500000001, - 0x0000000600000002, 0x0000000700000003}; + __m256i shuff = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, + 0x0000000700000003}; for (x = 0; x < len; x++) { DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, @@ -2169,8 +2170,8 @@ void ARGBToUVJRow_LASX(const uint8_t* src_argb, __m256i const_10 = __lasx_xvldi(0x40A); __m256i const_8080 = {0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080}; - __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, - 0x1715070513110301, 0x1F1D0F0D1B190B09}; + __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301, + 0x1F1D0F0D1B190B09}; for (x = 0; x < len; x++) { DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, diff --git a/source/row_neon.cc b/source/row_neon.cc index e1063236..aa3ac70a 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1645,29 +1645,6 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, : "cc", "memory", "q0", "q1", "q2", "q3"); } -void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { @@ -1686,48 +1663,6 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d1, d24 \n" // B - "vmlal.u8 q2, d2, d25 \n" // G - "vmlal.u8 q2, d3, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - // 8x1 pixels. void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, @@ -1747,15 +1682,13 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "vmull.u8 q2, d0, d24 \n" // B "vmlsl.u8 q2, d1, d25 \n" // G "vmlsl.u8 q2, d2, d26 \n" // R - "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned "vmull.u8 q3, d2, d24 \n" // R "vmlsl.u8 q3, d1, d28 \n" // G "vmlsl.u8 q3, d0, d27 \n" // B - "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + "vaddhn.u16 d0, q2, q15 \n" // +128 -> unsigned + "vaddhn.u16 d1, q3, q15 \n" // +128 -> unsigned "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. @@ -1775,13 +1708,11 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ - "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ - "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ - "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ - "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ + "vaddhn.u16 d0, q8, q15 \n" /* +128 -> unsigned */ \ + "vaddhn.u16 d1, q9, q15 \n" /* +128 -> unsigned */ // clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. @@ -2559,161 +2490,169 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); } -void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; + +struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, 0x1080}; + +struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. 
+void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vld1.8 {d0}, [%3] \n" // load rgbconstants + "vdup.u8 d20, d0[0] \n" + "vdup.u8 d21, d0[1] \n" + "vdup.u8 d22, d0[2] \n" + "vdup.u16 q12, d0[2] \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // R - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // B - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop. + "vmull.u8 q8, d0, d20 \n" // B + "vmull.u8 q9, d1, d20 \n" + "vmlal.u8 q8, d2, d21 \n" // G + "vmlal.u8 q9, d3, d21 \n" + "vmlal.u8 q8, d4, d22 \n" // R + "vmlal.u8 q9, d5, d22 \n" + "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y + "vaddhn.u16 d1, q9, q12 \n" + "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. "bgt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", + "q12"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants); } void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. +// Same code as ARGB, except the LD4 +void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vld1.8 {d0}, [%3] \n" // load rgbconstants + "vdup.u8 d20, d0[0] \n" + "vdup.u8 d21, d0[1] \n" + "vdup.u8 d22, d0[2] \n" + "vdup.u16 q12, d0[2] \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // R - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // B - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop. 
+ "vmull.u8 q8, d2, d20 \n" // B + "vmull.u8 q9, d3, d20 \n" + "vmlal.u8 q8, d4, d21 \n" // G + "vmlal.u8 q9, d5, d21 \n" + "vmlal.u8 q8, d6, d22 \n" // R + "vmlal.u8 q9, d7, d22 \n" + "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y + "vaddhn.u16 d1, q9, q12 \n" + "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", + "q12"); } void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // B - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // R - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); + RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants); } -void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants); } -void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants); +} + +void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "vld1.8 {d0}, [%3] \n" // load rgbconstants + "vdup.u8 d20, d0[0] \n" + "vdup.u8 d21, d0[1] \n" + "vdup.u8 d22, d0[2] \n" + "vdup.u16 q12, d0[2] \n" "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); + "vld3.8 {d2, d4, d6}, [%0]! 
\n" // load 16 pixels of + // RGB24. + "vld3.8 {d3, d5, d7}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop. + "vmull.u8 q8, d2, d20 \n" // B + "vmull.u8 q9, d3, d20 \n" + "vmlal.u8 q8, d4, d21 \n" // G + "vmlal.u8 q9, d5, d21 \n" + "vmlal.u8 q8, d6, d22 \n" // R + "vmlal.u8 q9, d7, d22 \n" + "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y + "vaddhn.u16 d1, q9, q12 \n" + "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", + "q12"); } void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - asm volatile( - "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q4, d0, d4 \n" // B - "vmlal.u8 q4, d1, d5 \n" // G - "vmlal.u8 q4, d2, d6 \n" // R - "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_yj), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4"); + RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); } void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - asm volatile( - "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q4, d0, d4 \n" // R - "vmlal.u8 q4, d1, d5 \n" // G - "vmlal.u8 q4, d2, d6 \n" // B - "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_yj), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4"); + RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants); } // Bilinear filter 16x2 -> 16x1 diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 7139ead7..ad355f2a 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2021,30 +2021,6 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } -void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { @@ -2063,50 +2039,6 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v1.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v0.8h, v2.8b, v5.8b \n" // G - "umlal v0.8h, v3.8b, v6.8b \n" // R - "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - // 8x1 pixels. void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, @@ -2124,18 +2056,16 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" "umlsl v4.8h, v1.8b, v25.8b \n" // G "umlsl v4.8h, v2.8b, v26.8b \n" // R - "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + "prfm pldl1keep, [%0, 448] \n" "umull v3.8h, v2.8b, v24.8b \n" // R "umlsl v3.8h, v1.8b, v28.8b \n" // G "umlsl v3.8h, v0.8b, v27.8b \n" // B - "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned + "addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. @@ -2166,10 +2096,8 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ - "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ - "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ - "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ - "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ + "addhn v0.8b, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "addhn v1.8b, v4.8h, v25.8h \n" /* +128 -> unsigned */ // clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 
@@ -2807,168 +2735,169 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); } -void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; + +struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, 0x1080}; + +struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { asm volatile( - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant + "ldr d0, [%3] \n" // load rgbconstants + "dup v6.16b, v0.b[0] \n" + "dup v7.16b, v0.b[1] \n" + "dup v16.16b, v0.b[2] \n" + "dup v17.8h, v0.h[2] \n" "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // R + "ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16 + // pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "umull v0.8h, v2.8b, v6.8b \n" // B + "umull2 v1.8h, v2.16b, v6.16b \n" "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // B - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "umlal v0.8h, v3.8b, v7.8b \n" // G + "umlal2 v1.8h, v3.16b, v7.16b \n" + "umlal v0.8h, v4.8b, v16.8b \n" // R + "umlal2 v1.8h, v4.16b, v16.16b \n" + "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y + "addhn v1.8b, v1.8h, v17.8h \n" + "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. "b.gt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants); } void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. 
+// Same code as ARGB, except the LD4 +void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { asm volatile( - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant + "ldr d0, [%3] \n" // load rgbconstants + "dup v6.16b, v0.b[0] \n" + "dup v7.16b, v0.b[1] \n" + "dup v16.16b, v0.b[2] \n" + "dup v17.8h, v0.h[2] \n" "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // R + "ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16 + // pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "umull v0.8h, v2.8b, v6.8b \n" // B + "umull2 v1.8h, v2.16b, v6.16b \n" "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // B - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); + "umlal v0.8h, v3.8b, v7.8b \n" // G + "umlal2 v1.8h, v3.16b, v7.16b \n" + "umlal v0.8h, v4.8b, v16.8b \n" // R + "umlal2 v1.8h, v4.16b, v16.16b \n" + "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y + "addhn v1.8b, v1.8h, v17.8h \n" + "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17"); } void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); + RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants); } -void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
- "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants); } -void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { - asm volatile( - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants); +} + +void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "ldr d0, [%3] \n" // load rgbconstants + "dup v5.16b, v0.b[0] \n" + "dup v6.16b, v0.b[1] \n" + "dup v7.16b, v0.b[2] \n" + "dup v16.8h, v0.h[2] \n" "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B + "ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "umull v0.8h, v2.8b, v5.8b \n" // B + "umull2 v1.8h, v2.16b, v5.16b \n" "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : + "umlal v0.8h, v3.8b, v6.8b \n" // G + "umlal2 v1.8h, v3.16b, v6.16b \n" + "umlal v0.8h, v4.8b, v7.8b \n" // R + "umlal2 v1.8h, v4.16b, v7.16b \n" + "addhn v0.8b, v0.8h, v16.8h \n" // 16 bit to 8 bit Y + "addhn v1.8b, v1.8h, v16.8h \n" + "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v0.8h, v1.8b, v5.8b \n" // G - "umlal v0.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_yj), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); + RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); } void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - asm volatile( - "movi v6.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v4.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "umull v0.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v0.8h, v1.8b, v5.8b \n" // G - "umlal v0.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_yj), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); + RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants); } // Bilinear filter 16x2 -> 16x1 diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index de90f660..185c5aa4 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -1314,7 +1314,7 @@ TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2) TESTATOBIPLANAR(ABGR, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(ABGR, 1, 4, NV21, 2, 2) -TESTATOBIPLANAR(RAW, 1, 3, JNV21, 2, 2) +TESTATOBIPLANAR(RAW, 1, 3, JNV21, 2, 2) TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2) TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2) @@ -3813,14 +3813,18 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10) #define P216ToAR30(a, b, c, d, e, f, g, h) \ P216ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) -#define P010ToARGBFilter(a, b, c, d, e, f, g, h) \ - P010ToARGBMatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, kFilterBilinear) -#define P210ToARGBFilter(a, b, c, d, e, f, g, h) \ - P210ToARGBMatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, kFilterBilinear) -#define P010ToAR30Filter(a, b, c, d, e, f, g, h) \ - P010ToAR30MatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, kFilterBilinear) -#define P210ToAR30Filter(a, b, c, d, e, f, g, h) \ - P210ToAR30MatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, kFilterBilinear) +#define P010ToARGBFilter(a, b, c, d, e, f, g, h) \ + P010ToARGBMatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \ + kFilterBilinear) +#define P210ToARGBFilter(a, b, c, d, e, f, g, h) \ + P210ToARGBMatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \ + kFilterBilinear) +#define P010ToAR30Filter(a, b, c, d, e, f, g, h) \ + P010ToAR30MatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \ + kFilterBilinear) +#define P210ToAR30Filter(a, b, c, d, e, f, g, h) \ + P210ToAR30MatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \ + kFilterBilinear) #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) TESTBIPLANAR16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10) |