diff options
author | Frank Barchard <fbarchard@google.com> | 2023-06-16 14:13:54 -0700 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-06-16 21:37:53 +0000 |
commit | a366ad714a37e6fd435914fd4df0ffa36b244cba (patch) | |
tree | 9ec852fbcf8945306e258cc994431a593ab4ca21 | |
parent | 04821d1e7d60845525e8db55c7bcd41ef5be9406 (diff) | |
download | libyuv-a366ad714a37e6fd435914fd4df0ffa36b244cba.tar.gz |
ARGBAttenuate use (a + b + 255) >> 8
- Makes ARM and Intel match and fixes some off by 1 cases
- Add ARGBToUV444MatrixRow_NEON
- Add ConvertFP16ToFP32Column_NEON
- scale_rvv fix intrinsic build error
- disable row_win version of ARGBAttenuate/Unattenuate
Bug: libyuv:936, libyuv:956
Change-Id: Ied99aaad3a11a8eb69212b628c58f86ec0723c38
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4617013
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | README.chromium | 2 | ||||
-rw-r--r-- | include/libyuv/row.h | 21 | ||||
-rw-r--r-- | include/libyuv/version.h | 2 | ||||
-rw-r--r-- | source/row_common.cc | 8 | ||||
-rw-r--r-- | source/row_gcc.cc | 101 | ||||
-rw-r--r-- | source/row_neon.cc | 102 | ||||
-rw-r--r-- | source/row_neon64.cc | 179 | ||||
-rw-r--r-- | source/scale_rvv.cc | 3 | ||||
-rw-r--r-- | unit_test/planar_test.cc | 149 |
9 files changed, 390 insertions, 177 deletions
diff --git a/README.chromium b/README.chromium index 615b7358..aecf7085 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1872 +Version: 1873 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 8df334b1..eb7a8d06 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -161,7 +161,6 @@ extern "C" { #define HAS_ARGBSEPIAROW_SSSE3 #define HAS_ARGBSHADEROW_SSE2 #define HAS_ARGBSUBTRACTROW_SSE2 -#define HAS_ARGBUNATTENUATEROW_SSE2 #define HAS_BLENDPLANEROW_SSSE3 #define HAS_COMPUTECUMULATIVESUMROW_SSE2 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 @@ -171,9 +170,6 @@ extern "C" { #define HAS_SOBELXROW_SSE2 #define HAS_SOBELXYROW_SSE2 #define HAS_SOBELYROW_SSE2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBATTENUATEROW_SSSE3 -#endif // The following functions fail on gcc/clang 32 bit with fpic and framepointer. // caveat: clangcl uses row_win.cc which works. @@ -241,11 +237,7 @@ extern "C" { #define HAS_ARGBADDROW_AVX2 #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 -#define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_BLENDPLANEROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBATTENUATEROW_AVX2 -#endif #if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ defined(_MSC_VER) @@ -285,14 +277,15 @@ extern "C" { #define HAS_ABGRTOAR30ROW_SSSE3 #define HAS_ABGRTOYJROW_SSSE3 #define HAS_AR64TOARGBROW_SSSE3 +#define HAS_ARGBATTENUATEROW_SSSE3 #define HAS_ARGBTOAB64ROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_ARGBTOAR64ROW_SSSE3 +#define HAS_ARGBUNATTENUATEROW_SSE2 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 -#define HAS_DETILEROW_SSE2 #define HAS_DETILEROW_16_SSE2 -#define HAS_DETILEROW_16_AVX +#define HAS_DETILEROW_SSE2 #define HAS_DETILESPLITUVROW_SSSE3 #define HAS_DETILETOYUY2_SSE2 #define HAS_HALFMERGEUVROW_SSSE3 @@ -345,13 +338,16 @@ extern "C" { #define 
HAS_ABGRTOYJROW_AVX2 #define HAS_ABGRTOYROW_AVX2 #define HAS_AR64TOARGBROW_AVX2 +#define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBTOAB64ROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTOAR64ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 +#define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 +#define HAS_DETILEROW_16_AVX #define HAS_DIVIDEROW_16_AVX2 #define HAS_HALFMERGEUVROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 @@ -6190,6 +6186,11 @@ void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr, void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 float* dst, int width); +// Convert a column of FP16 Half Floats to a row of FP32 Floats +void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 + int src_stride, // stride in elements + float* dst, + int width); // Convert FP32 Floats to FP16 Half Floats void ConvertFP32ToFP16Row_NEON(const float* src, uint16_t* dst, // fp16 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 8df341d4..1888d67a 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1872 +#define LIBYUV_VERSION 1873 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_common.cc b/source/row_common.cc index 8be37fb5..fdd49a65 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -48,7 +48,6 @@ extern "C" { defined(__i386__) || defined(_M_IX86)) #define LIBYUV_ARGBTOUV_PAVGB 1 #define LIBYUV_RGBTOU_TRUNCATE 1 -#define LIBYUV_ATTENUATE_DUP 1 #endif #if defined(LIBYUV_BIT_EXACT) #define LIBYUV_UNATTENUATE_DUP 1 @@ -3369,12 +3368,7 @@ void BlendPlaneRow_C(const uint8_t* src0, } #undef UBLEND -#if LIBYUV_ATTENUATE_DUP -// This code mimics the SSSE3 version for better testability. 
-#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 -#else -#define ATTENUATE(f, a) (f * a + 128) >> 8 -#endif +#define ATTENUATE(f, a) (f * a + 255) >> 8 // Multiply source RGB by alpha and store to destination. void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { diff --git a/source/row_gcc.cc b/source/row_gcc.cc index e94fd04d..d8074987 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -7441,93 +7441,106 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, - 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; -static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; +static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128, + -128, -128, 14, -128, 14, -128, + 14, -128, -128, -128}; + // Attenuate 4 pixels at a time. void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "punpcklbw %%xmm6,%%xmm7 \n" + "sub %0,%1 \n" // 4 pixel loop. 
LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "pand %%xmm3,%%xmm2 \n" + "movdqu (%0),%%xmm6 \n" + "movdqa %%xmm6,%%xmm0 \n" + "movdqa %%xmm6,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufb %%xmm4,%%xmm2 \n" // a,a,a,0 + "pshufb %%xmm4,%%xmm3 \n" + "pmullw %%xmm2,%%xmm0 \n" // rgb * alpha + "pmullw %%xmm3,%%xmm1 \n" + "paddw %%xmm7,%%xmm0 \n" // + 255 + "paddw %%xmm7,%%xmm1 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "pand %%xmm5,%%xmm6 \n" + "por %%xmm6,%%xmm0 \n" + "movdqu %%xmm0,(%0,%1) \n" + "lea 0x10(%0),%0 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha0), // %3 - "m"(kShuffleAlpha1) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kAttenuateShuffle) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBATTENUATEROW_SSSE3 #ifdef HAS_ARGBATTENUATEROW_AVX2 + // Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, - 128u, 128u, 14u, 15u, 14u, 15u, - 14u, 15u, 128u, 128u}; +static const lvec8 kAttenuateShuffle_AVX2 = { + 6, -128, 6, -128, 6, -128, -128, -128, 14, -128, 14, + -128, 14, -128, -128, -128, 22, -128, 22, -128, 22, -128, + -128, -128, 30, -128, 30, -128, 30, -128, -128, -128}; + // Attenuate 8 pixels at a time. 
void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + "vmovdqa %3,%%ymm4 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpslld $0x18,%%ymm5,%%ymm5 \n" + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" + "vpunpcklbw %%ymm6,%%ymm7,%%ymm7 \n" "sub %0,%1 \n" // 8 pixel loop. LABELALIGN "1: \n" "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm5,%%ymm6,%%ymm1 \n" "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpmullw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm1,%%ymm1 \n" "vpsrlw $0x8,%%ymm0,%%ymm0 \n" "vpsrlw $0x8,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm6,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,0x00(%0,%1,1) \n" "lea 0x20(%0),%0 \n" "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kAttenuateShuffle_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBATTENUATEROW_AVX2 diff --git a/source/row_neon.cc b/source/row_neon.cc index 4ed13638..31142a90 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1827,19 +1827,27 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } +struct RgbUVConstants { + uint8_t kRGBToU[4]; + uint8_t kRGBToV[4]; +}; + // 8x1 pixels. 
-void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 - // coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient +void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { + asm volatile( + + "vld1.8 {d0}, [%4] \n" // load rgbuvconstants + "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient + "vdup.u8 d25, d0[1] \n" // UG -0.5781 coefficient + "vdup.u8 d26, d0[2] \n" // UR -0.2969 coefficient + "vdup.u8 d27, d0[4] \n" // VB -0.1406 coefficient + "vdup.u8 d28, d0[5] \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. @@ -1857,15 +1865,53 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. 
"bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(rgbuvconstants) // %4 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"); } +// RGB to bt601 coefficients +// UB 0.875 coefficient = 112 +// UG -0.5781 coefficient = 74 +// UR -0.2969 coefficient = 38 +// VB -0.1406 coefficient = 18 +// VG -0.7344 coefficient = 94 +// VR 0.875 coefficient = 112 (ignored) + +static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0}, + {18, 94, 112, 0}}; + +// RGB to JPeg coefficients +// UB coeff 0.500 = 127 +// UG coeff -0.33126 = 84 +// UR coeff -0.16874 = 43 +// VB coeff -0.08131 = 20 +// VG coeff -0.41869 = 107 +// VR coeff 0.500 = 127 (ignored) + +static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0}, + {20, 107, 127, 0}}; + +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, + &kRgb24I601UVConstants); +} + +void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, + &kRgb24JPegUVConstants); +} + // clang-format off // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
#define RGBTOUV(QB, QG, QR) \ @@ -2702,7 +2748,6 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, struct RgbConstants { uint8_t kRGBToY[4]; uint16_t kAddY; - uint16_t pad; }; // RGB to JPeg coefficients @@ -2710,11 +2755,9 @@ struct RgbConstants { // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, - 128, - 0}; +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -2723,12 +2766,9 @@ static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; // Add 16.5 = 0x1080 static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080, - 0}; + 0x1080}; -static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, - 0x1080, - 0}; +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; // ARGB expects first 3 values to contain RGB and 4th value is ignored. void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, @@ -3058,6 +3098,8 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( + "vmov.u16 q15, #0x00ff \n" // 255 for rounding up + // Attenuate 8 pixels. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. @@ -3065,16 +3107,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, "vmull.u8 q10, d0, d3 \n" // b * a "vmull.u8 q11, d1, d3 \n" // g * a "vmull.u8 q12, d2, d3 \n" // r * a - "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 - "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 - "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vaddhn.u16 d0, q10, q15 \n" // (b + 255) >> 8 + "vaddhn.u16 d1, q11, q15 \n" // (g + 255) >> 8 + "vaddhn.u16 d2, q12, q15 \n" // (r + 255) >> 8 "vst4.8 {d0, d1, d2, d3}, [%1]! 
\n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); + : "cc", "memory", "q0", "q1", "q10", "q11", "q12", "q15"); } // Quantize 8 ARGB pixels (32 bytes). diff --git a/source/row_neon64.cc b/source/row_neon64.cc index a341dc13..a5c24e84 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2198,19 +2198,26 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } +struct RgbUVConstants { + uint8_t kRGBToU[4]; + uint8_t kRGBToV[4]; +}; + // 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movi v24.8b, #112 \n" // UB / VR 0.875 - // coefficient - "movi v25.8b, #74 \n" // UG -0.5781 coefficient - "movi v26.8b, #38 \n" // UR -0.2969 coefficient - "movi v27.8b, #18 \n" // VB -0.1406 coefficient - "movi v28.8b, #94 \n" // VG -0.7344 coefficient - "movi v29.16b,#0x80 \n" // 128.5 +void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { + asm volatile( + "ldr d0, [%4] \n" // load rgbuvconstants + "dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient + "dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient + "dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient + "dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient + "dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient + "movi v29.16b, #0x80 \n" // 128.5 + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "subs %w3, %w3, #8 \n" // 8 processed per loop. @@ -2229,15 +2236,53 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 
"b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(rgbuvconstants) // %4 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", "v27", "v28", "v29"); } +// RGB to bt601 coefficients +// UB 0.875 coefficient = 112 +// UG -0.5781 coefficient = 74 +// UR -0.2969 coefficient = 38 +// VB -0.1406 coefficient = 18 +// VG -0.7344 coefficient = 94 +// VR 0.875 coefficient = 112 (ignored) + +static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0}, + {18, 94, 112, 0}}; + +// RGB to JPeg coefficients +// UB coeff 0.500 = 127 +// UG coeff -0.33126 = 84 +// UR coeff -0.16874 = 43 +// VB coeff -0.08131 = 20 +// VG coeff -0.41869 = 107 +// VR coeff 0.500 = 127 (ignored) + +static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0}, + {20, 107, 127, 0}}; + +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, + &kRgb24I601UVConstants); +} + +void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, + &kRgb24JPegUVConstants); +} + #define RGBTOUV_SETUP_REG \ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ @@ -2943,34 +2988,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, struct RgbConstants { uint8_t kRGBToY[4]; uint16_t kAddY; - uint16_t pad; }; -// RGB to JPeg coefficients -// B * 0.1140 coefficient = 29 -// G * 0.5870 coefficient = 150 -// R * 0.2990 coefficient = 77 -// Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, - 128, - 0}; - -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; - -// RGB to BT.601 
coefficients -// B * 0.1016 coefficient = 25 -// G * 0.5078 coefficient = 129 -// R * 0.2578 coefficient = 66 -// Add 16.5 = 0x1080 - -static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080, - 0}; - -static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, - 0x1080, - 0}; - // ARGB expects first 3 values to contain RGB and 4th value is ignored. void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, @@ -3005,6 +3024,26 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, "v17"); } +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; + void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); } @@ -3402,24 +3441,26 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( + "movi v7.8h, #0x00ff \n" // 255 for rounding up + // Attenuate 8 pixels. "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"umull v4.8h, v0.8b, v3.8b \n" // b * a "prfm pldl1keep, [%0, 448] \n" - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "addhn v0.8b, v4.8h, v7.8h \n" // (b + 255) >> 8 + "addhn v1.8b, v5.8h, v7.8h \n" // (g + 255) >> 8 + "addhn v2.8b, v6.8h, v7.8h \n" // (r + 255) >> 8 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Quantize 8 ARGB pixels (32 bytes). @@ -3980,6 +4021,46 @@ void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 : "cc", "memory", "v1", "v2", "v3"); } +// Convert FP16 Half Floats to FP32 Floats +// Read a column and write a row +void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 + int src_stride, // stride in elements + float* dst, + int width) { + asm volatile( + "cmp %w2, #8 \n" // Is there 8 rows? + "b.lo 2f \n" + "1: \n" + "ld1 {v0.h}[0], [%0], %3 \n" // load 8 halffloats + "ld1 {v0.h}[1], [%0], %3 \n" + "ld1 {v0.h}[2], [%0], %3 \n" + "ld1 {v0.h}[3], [%0], %3 \n" + "ld1 {v1.h}[0], [%0], %3 \n" + "ld1 {v1.h}[1], [%0], %3 \n" + "ld1 {v1.h}[2], [%0], %3 \n" + "ld1 {v1.h}[3], [%0], %3 \n" + "subs %w2, %w2, #8 \n" // 8 rows per loop + "prfm pldl1keep, [%0, 448] \n" + "fcvtl v2.4s, v0.4h \n" // 4 floats + "fcvtl v3.4s, v1.4h \n" // 4 more floats + "stp q2, q3, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" + "cmp %w2, #1 \n" // Is there 1 value? 
+ "b.lo 3f \n" + "2: \n" + "ld1 {v1.h}[0], [%0], %3 \n" // load 1 halffloats + "subs %w2, %w2, #1 \n" // 1 floats per loop + "fcvtl v2.4s, v1.4h \n" // 1 floats + "str s2, [%1], #4 \n" // store 1 floats + "b.gt 2b \n" + "3: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)(src_stride * 2)) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + // Convert FP32 Floats to FP16 Half Floats void ConvertFP32ToFP16Row_NEON(const float* src, uint16_t* dst, // fp16 diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc index 99316c48..749deccd 100644 --- a/source/scale_rvv.cc +++ b/source/scale_rvv.cc @@ -75,7 +75,6 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, asm volatile("csrwi vxrm, 0"); do { vuint8m4_t v_odd, v_even, v_dst; - vuint16m8_t v_sum; vuint32m4_t v_odd_32, v_even_32; size_t vl = __riscv_vsetvl_e32m4(w); __riscv_vlseg2e32_v_u32m4(&v_even_32, &v_odd_32, src, vl); @@ -499,7 +498,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv, vuint8m4_t v_u0v0, v_u1v1, v_avg; vuint16m4_t v_u0v0_16, v_u1v1_16; size_t vl = __riscv_vsetvl_e16m4(w); - vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl); + __riscv_vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl); v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16); v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16); // Use round-to-nearest-up mode for averaging add diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 1019a7b3..ec1d72eb 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -30,9 +30,9 @@ #endif #if defined(LIBYUV_BIT_EXACT) -#define EXPECTED_ATTENUATE_DIFF 0 +#define EXPECTED_UNATTENUATE_DIFF 0 #else -#define EXPECTED_ATTENUATE_DIFF 2 +#define EXPECTED_UNATTENUATE_DIFF 2 #endif namespace libyuv { @@ -57,12 +57,17 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { orig_pixels[2 * 4 + 0] = 16u; orig_pixels[2 * 4 + 1] = 64u; orig_pixels[2 * 4 + 2] = 192u; - orig_pixels[2 * 4 + 3] = 255u; + orig_pixels[2 * 4 + 3] = 128u; 
orig_pixels[3 * 4 + 0] = 16u; orig_pixels[3 * 4 + 1] = 64u; orig_pixels[3 * 4 + 2] = 192u; - orig_pixels[3 * 4 + 3] = 128u; - ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1); + orig_pixels[3 * 4 + 3] = 255u; + orig_pixels[4 * 4 + 0] = 255u; + orig_pixels[4 * 4 + 1] = 255u; + orig_pixels[4 * 4 + 2] = 255u; + orig_pixels[4 * 4 + 3] = 255u; + + ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 5, 1); EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]); EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]); EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]); @@ -71,14 +76,55 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]); EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]); EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]); - EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]); - EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]); - EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]); - EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]); - EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]); - EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]); - EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]); - EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]); + EXPECT_EQ(32u, unatten_pixels[2 * 4 + 0]); + EXPECT_EQ(128u, unatten_pixels[2 * 4 + 1]); + EXPECT_EQ(255u, unatten_pixels[2 * 4 + 2]); + EXPECT_EQ(128u, unatten_pixels[2 * 4 + 3]); + EXPECT_EQ(16u, unatten_pixels[3 * 4 + 0]); + EXPECT_EQ(64u, unatten_pixels[3 * 4 + 1]); + EXPECT_EQ(192u, unatten_pixels[3 * 4 + 2]); + EXPECT_EQ(255u, unatten_pixels[3 * 4 + 3]); + EXPECT_EQ(255u, unatten_pixels[4 * 4 + 0]); + EXPECT_EQ(255u, unatten_pixels[4 * 4 + 1]); + EXPECT_EQ(255u, unatten_pixels[4 * 4 + 2]); + EXPECT_EQ(255u, unatten_pixels[4 * 4 + 3]); + + ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 5, 1); + EXPECT_EQ(100u, atten_pixels[0 * 4 + 0]); + EXPECT_EQ(65u, atten_pixels[0 * 4 + 1]); + EXPECT_EQ(64u, atten_pixels[0 * 4 + 2]); + EXPECT_EQ(128u, atten_pixels[0 * 4 + 3]); + EXPECT_EQ(0u, atten_pixels[1 * 4 + 0]); + EXPECT_EQ(0u, atten_pixels[1 * 4 + 1]); + EXPECT_EQ(0u, atten_pixels[1 * 4 + 
2]); + EXPECT_EQ(0u, atten_pixels[1 * 4 + 3]); + EXPECT_EQ(8u, atten_pixels[2 * 4 + 0]); + EXPECT_EQ(32u, atten_pixels[2 * 4 + 1]); + EXPECT_EQ(96u, atten_pixels[2 * 4 + 2]); + EXPECT_EQ(128u, atten_pixels[2 * 4 + 3]); + EXPECT_EQ(16u, atten_pixels[3 * 4 + 0]); + EXPECT_EQ(64u, atten_pixels[3 * 4 + 1]); + EXPECT_EQ(192u, atten_pixels[3 * 4 + 2]); + EXPECT_EQ(255u, atten_pixels[3 * 4 + 3]); + EXPECT_EQ(255u, atten_pixels[4 * 4 + 0]); + EXPECT_EQ(255u, atten_pixels[4 * 4 + 1]); + EXPECT_EQ(255u, atten_pixels[4 * 4 + 2]); + EXPECT_EQ(255u, atten_pixels[4 * 4 + 3]); + + // test 255 + for (int i = 0; i < 256; ++i) { + orig_pixels[i * 4 + 0] = i; + orig_pixels[i * 4 + 1] = 0; + orig_pixels[i * 4 + 2] = 0; + orig_pixels[i * 4 + 3] = 255; + } + ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 256, 1); + for (int i = 0; i < 256; ++i) { + EXPECT_EQ(orig_pixels[i * 4 + 0], atten_pixels[i * 4 + 0]); + EXPECT_EQ(0, atten_pixels[i * 4 + 1]); + EXPECT_EQ(0, atten_pixels[i * 4 + 2]); + EXPECT_EQ(255, atten_pixels[i * 4 + 3]); + } for (int i = 0; i < 1280; ++i) { orig_pixels[i * 4 + 0] = i; @@ -92,10 +138,10 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1); } for (int i = 0; i < 1280; ++i) { - EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2); - EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2); - EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2); - EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2); + EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 1); + EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 1); + EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 1); + EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 1); } // Make sure transparent, 50% and opaque are fully accurate. 
EXPECT_EQ(0, atten_pixels[0 * 4 + 0]); @@ -106,9 +152,9 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { EXPECT_EQ(32, atten_pixels[128 * 4 + 1]); EXPECT_EQ(21, atten_pixels[128 * 4 + 2]); EXPECT_EQ(128, atten_pixels[128 * 4 + 3]); - EXPECT_NEAR(254, atten_pixels[255 * 4 + 0], EXPECTED_ATTENUATE_DIFF); - EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], EXPECTED_ATTENUATE_DIFF); - EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], EXPECTED_ATTENUATE_DIFF); + EXPECT_EQ(255, atten_pixels[255 * 4 + 0]); + EXPECT_EQ(127, atten_pixels[255 * 4 + 1]); + EXPECT_EQ(85, atten_pixels[255 * 4 + 2]); EXPECT_EQ(255, atten_pixels[255 * 4 + 3]); free_aligned_buffer_page_end(atten2_pixels); @@ -165,28 +211,28 @@ TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) { benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_EQ(max_diff, 0); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) { int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_EQ(max_diff, 0); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) { int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_EQ(max_diff, 0); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) { int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_EQ(max_diff, 0); } static int TestUnattenuateI(int width, @@ -238,28 +284,28 @@ TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) { int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_LE(max_diff, 
EXPECTED_UNATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); + EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) { @@ -2756,7 +2802,7 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { MaskCpuFlags(benchmark_cpu_info_); ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt, - benchmark_width_, benchmark_width_, benchmark_height_); + benchmark_width_, benchmark_width_, benchmark_height_); double opt_time = get_time(); for (int i = 0; i < benchmark_iterations_; ++i) { ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt, @@ -2764,8 +2810,8 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { } opt_time = (get_time() - opt_time) / benchmark_iterations_; // Report performance of C vs OPT - printf("%8d us C - %8d us OPT\n", - static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); + printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6), + static_cast<int>(opt_time * 1e6)); for (int i = 0; i < kPixels; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } @@ -2804,8 +2850,8 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) { opt_time = (get_time() - opt_time) / benchmark_iterations_; // 
Report performance of C vs OPT - printf("%8d us C - %8d us OPT\n", - static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); + printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6), + static_cast<int>(opt_time * 1e6)); for (int i = 0; i < kPixels * 4; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } @@ -4531,6 +4577,43 @@ TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32) { free_aligned_buffer_page_end(rec_opt); } +TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32Column) { + int i, j; + const int y_plane_size = benchmark_width_ * benchmark_height_; + + align_buffer_page_end(orig_f, y_plane_size * 4); + align_buffer_page_end(orig_y, y_plane_size * 2); + align_buffer_page_end(dst_opt, y_plane_size * 4); + align_buffer_page_end(rec_opt, y_plane_size * 2); + + for (i = 0; i < y_plane_size; ++i) { + ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f; + } + memset(orig_y, 1, y_plane_size * 2); + memset(dst_opt, 2, y_plane_size * 4); + memset(rec_opt, 3, y_plane_size * 2); + + ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y, + y_plane_size); + + for (j = 0; j < benchmark_iterations_; j++) { + ConvertFP16ToFP32Column_NEON((const uint16_t*)orig_y, 1, (float*)dst_opt, + y_plane_size); + } + + ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt, + y_plane_size); + + for (i = 0; i < y_plane_size; ++i) { + EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]); + } + + free_aligned_buffer_page_end(orig_f); + free_aligned_buffer_page_end(orig_y); + free_aligned_buffer_page_end(dst_opt); + free_aligned_buffer_page_end(rec_opt); +} + #endif // defined(ENABLE_ROW_TESTS) && defined(__aarch64__) } // namespace libyuv |