diff options
author | Frank Barchard <fbarchard@google.com> | 2021-03-23 15:54:02 -0700 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2021-03-23 23:45:54 +0000 |
commit | d8f1bfc9816a9fc76f3a25cc0ee272fb9c07622a (patch) | |
tree | 6201f9cab35550653480bc372580d2c5014d074d /source | |
parent | b046131c0bd44ca3a11276194d07b85373cfd608 (diff) | |
download | libyuv-d8f1bfc9816a9fc76f3a25cc0ee272fb9c07622a.tar.gz |
Add RAWToJ420
Add J420 output from RAW.
Optimize RGB24 and RAW To J420 on ARM by using NEON for the 2 step conversion.
Also fix a sign-compare warning that was breaking the Windows build.
Bug: libyuv:887, b/183534734
Change-Id: I8c39334552dc0b28414e638708db413d6adf8d6e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2783382
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Diffstat (limited to 'source')
-rw-r--r-- | source/convert.cc | 262 | ||||
-rw-r--r-- | source/convert_argb.cc | 12 | ||||
-rw-r--r-- | source/row_gcc.cc | 30 | ||||
-rw-r--r-- | source/row_neon.cc | 6 | ||||
-rw-r--r-- | source/row_neon64.cc | 12 |
5 files changed, 273 insertions, 49 deletions
diff --git a/source/convert.cc b/source/convert.cc index 1bd59659..768e0f37 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1368,6 +1368,18 @@ int ARGBToI420(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } +#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; @@ -1388,22 +1400,6 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif #if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; @@ -1771,7 +1767,7 @@ int RGB24ToI420(const uint8_t* src_rgb24, } // Neon version does direct RGB24 to YUV. -#if defined(HAS_RGB24TOYROW_NEON) +#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToUVRow = RGB24ToUVRow_Any_NEON; RGB24ToYRow = RGB24ToYRow_Any_NEON; @@ -1808,6 +1804,14 @@ int RGB24ToI420(const uint8_t* src_rgb24, #endif // Other platforms do intermediate conversion from RGB24 to ARGB. 
#else +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; @@ -1816,6 +1820,18 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; @@ -1960,6 +1976,14 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } #endif #else +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; @@ -1968,6 +1992,18 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } + } +#endif #if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; @@ -2111,6 +2147,26 @@ int RAWToI420(const uint8_t* src_raw, #endif // Other platforms do intermediate conversion from RAW to ARGB. 
#else +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif #if defined(HAS_RAWTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; @@ -2186,6 +2242,178 @@ int RAWToI420(const uint8_t* src_raw, return 0; } +// TODO(fbarchard): Use Matrix version to implement I420 and J420. +// Convert RAW to J420. +LIBYUV_API +int RAWToJ420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI) + void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RAWToUVJRow_C; + void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYJRow_C; +#else + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYJRow_C; +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +// Neon version does direct RAW to YUV. +#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVJRow = RAWToUVJRow_Any_NEON; + RAWToYJRow = RAWToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToYJRow = RAWToYJRow_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVJRow = RAWToUVJRow_NEON; + } + } + } +// MMI and MSA version does direct RAW to YUV. +#elif (defined(HAS_RAWTOYJROW_MMI) || defined(HAS_RAWTOYJROW_MSA)) +#if defined(HAS_RAWTOYJROW_MMI) && defined(HAS_RAWTOUVJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RAWToUVJRow = RAWToUVJRow_Any_MMI; + RAWToYJRow = RAWToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RAWToYJRow = RAWToYJRow_MMI; + if (IS_ALIGNED(width, 16)) { + RAWToUVJRow = RAWToUVJRow_MMI; + } + } + } +#endif +#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVJRow = RAWToUVJRow_Any_MSA; + RAWToYJRow = RAWToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_MSA; + RAWToUVJRow = RAWToUVJRow_MSA; + } + } +#endif +#else +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } + } +#endif +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) && 
defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#endif + + { +#if !((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) + RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYJRow(src_raw, dst_y, width); + RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); + ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) + RAWToUVJRow(src_raw, 0, dst_u, dst_v, width); + RAWToYJRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVJRow(row, 0, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); +#endif + } +#if !((defined(HAS_RAWTOYJROW_NEON) && 
defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + // Convert RGB565 to I420. LIBYUV_API int RGB565ToI420(const uint8_t* src_rgb565, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 0bd330ec..16b5ff92 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3376,14 +3376,14 @@ int AR30ToAB30(const uint8_t* src_ar30, // Convert AR64 to ARGB. LIBYUV_API int AR64ToARGB(const uint16_t* src_ar64, - int src_stride_ar64, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { + int src_stride_ar64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, - int width) = AR64ToARGBRow_C; + int width) = AR64ToARGBRow_C; if (!src_ar64 || !dst_argb || width <= 0 || height == 0) { return -1; } diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 49d45397..1b4ad9b0 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1116,8 +1116,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, asm volatile( "movdqa %3,%%xmm2 \n" - "movdqa %4,%%xmm3 \n" - LABELALIGN + "movdqa %4,%%xmm3 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1129,11 +1128,11 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, "lea 0x20(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToAB64Lo), // %3 - "m"(kShuffleARGBToAB64Hi) // %4 + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToAB64Lo), // %3 + "m"(kShuffleARGBToAB64Hi) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2"); } @@ -1166,8 +1165,7 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ar64, int width) { asm volatile( - "movdqa %3,%%xmm2 \n" - LABELALIGN + "movdqa %3,%%xmm2 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1220,8 
+1218,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, asm volatile( "vbroadcastf128 %3,%%ymm2 \n" - "vbroadcastf128 %4,%%ymm3 \n" - LABELALIGN + "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" @@ -1233,11 +1230,11 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, "lea 0x40(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 : "m"(kShuffleARGBToAB64Lo), // %3 - "m"(kShuffleARGBToAB64Hi) // %3 + "m"(kShuffleARGBToAB64Hi) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif @@ -1275,8 +1272,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ar64, int width) { asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" - LABELALIGN + "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" diff --git a/source/row_neon.cc b/source/row_neon.cc index 5414d1ef..5d109a3b 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -2191,7 +2191,7 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64, : "cc", "memory", "q0", "q1", "q2", "q3"); } -static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; +static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, @@ -2362,9 +2362,9 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q4, d0, d4 \n" // B + "vmull.u8 q4, d0, d4 \n" // R "vmlal.u8 q4, d1, d5 \n" // G - "vmlal.u8 q4, d2, d6 \n" // R + "vmlal.u8 q4, d2, d6 \n" // B "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
"bgt 1b \n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 9662cd3c..3281e90f 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -1628,10 +1628,10 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64, "subs %w2, %w2, #8 \n" // 8 processed per loop. "stp q0, q2, [%1], #32 \n" // store 8 pixels "b.gt 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAR64ToARGB) // %3 + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAR64ToARGB) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } @@ -2506,9 +2506,9 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B + "umull v0.8h, v0.8b, v4.8b \n" // R "umlal v0.8h, v1.8b, v5.8b \n" // G - "umlal v0.8h, v2.8b, v6.8b \n" // R + "umlal v0.8h, v2.8b, v6.8b \n" // B "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "b.gt 1b \n" |