diff options
author | Frank Barchard <fbarchard@google.com> | 2023-02-13 10:52:58 -0800 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-02-13 20:14:57 +0000 |
commit | 2bdc210be9eb11ded16bf3ef1f6cadb0d4dcb0c2 (patch) | |
tree | c446d71538c965d0e5391ef77cd49b45ba51463d | |
parent | b2528b0be934de1918e20c85fc170d809eeb49ab (diff) | |
download | libyuv-2bdc210be9eb11ded16bf3ef1f6cadb0d4dcb0c2.tar.gz |
MergeUV_AVX512BW for I420ToNV12
On Skylake Xeon 640x360 100000 iterations
AVX512 MergeUVPlane_Opt (1196 ms)
AVX2 MergeUVPlane_Opt (1565 ms)
SSE2 MergeUVPlane_Opt (1780 ms)
Pixel 7 MergeUVPlane_Opt (1177 ms)
Bug: None
Change-Id: If47d4fa957cf27781bba5fd6a2f0bf554101a5c6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4242247
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
-rw-r--r-- | README.chromium | 2 | ||||
-rw-r--r-- | include/libyuv/rotate_row.h | 21 | ||||
-rw-r--r-- | include/libyuv/row.h | 12 | ||||
-rw-r--r-- | include/libyuv/version.h | 2 | ||||
-rw-r--r-- | source/convert.cc | 8 | ||||
-rw-r--r-- | source/convert_from_argb.cc | 40 | ||||
-rw-r--r-- | source/planar_functions.cc | 8 | ||||
-rw-r--r-- | source/rotate_common.cc | 57 | ||||
-rw-r--r-- | source/rotate_neon64.cc | 39 | ||||
-rw-r--r-- | source/row_any.cc | 3 | ||||
-rw-r--r-- | source/row_gcc.cc | 21 | ||||
-rw-r--r-- | source/row_lasx.cc | 110 | ||||
-rw-r--r-- | source/row_lsx.cc | 80 | ||||
-rw-r--r-- | source/row_neon64.cc | 22 | ||||
-rw-r--r-- | source/scale.cc | 20 | ||||
-rw-r--r-- | source/scale_common.cc | 6 | ||||
-rw-r--r-- | unit_test/rotate_test.cc | 47 |
17 files changed, 390 insertions, 108 deletions
diff --git a/README.chromium b/README.chromium index 77eac367..130a916a 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1857 +Version: 1860 License: BSD License File: LICENSE diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h index b773f886..64d0b59f 100644 --- a/include/libyuv/rotate_row.h +++ b/include/libyuv/rotate_row.h @@ -232,6 +232,27 @@ void TransposeWx1_16_C(const uint16_t* src, uint16_t* dst, int dst_stride, int width); + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void Transpose4x4_32_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +// Transpose 32 bit values (ARGB) +void Transpose8x8_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 8d998727..e1837169 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -402,9 +402,11 @@ extern "C" { // The following are available for AVX512 clang x86 platforms: // TODO(fbarchard): Port to GCC and Visual C // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789 +// TODO(fbarchard): Port MERGEUV to assembly #if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512)) + (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512) && !defined(_MSC_VER)) #define HAS_ARGBTORGB24ROW_AVX512VBMI +#define HAS_MERGEUVROW_AVX512BW #endif // The following are available for AVX512 clang x64 platforms: @@ -2184,6 +2186,10 @@ void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); +void MergeUVRow_AVX512BW(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -2204,6 +2210,10 @@ void MergeUVRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void MergeUVRow_Any_AVX512BW(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void MergeUVRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index adc49c4e..3e877f38 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1857 +#define LIBYUV_VERSION 1860 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 15c70a65..5ba6e580 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -924,6 +924,14 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow = MergeUVRow_Any_NEON; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index d548aec2..1b8572a0 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -389,6 +389,14 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -559,6 +567,14 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -726,6 +742,14 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -894,6 +918,14 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -2921,6 +2953,14 @@ int RAWToJNV21(const uint8_t* src_raw, } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index e08a44f6..0f89d269 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -599,6 +599,14 @@ void MergeUVPlane(const uint8_t* src_u, } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + MergeUVRow = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow = MergeUVRow_Any_NEON; diff --git a/source/rotate_common.cc b/source/rotate_common.cc index 2617c01b..4b496d1b 100644 --- a/source/rotate_common.cc +++ b/source/rotate_common.cc @@ -166,6 +166,63 @@ void TransposeWxH_16_C(const uint16_t* src, } } +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + int i; + for (i = 0; i < width; i += 4) { + uint32_t p00 = ((uint32_t*)(src))[0]; + uint32_t p10 = ((uint32_t*)(src))[1]; + uint32_t p20 = ((uint32_t*)(src))[2]; + uint32_t p30 = ((uint32_t*)(src))[3]; + uint32_t p01 = ((uint32_t*)(src1))[0]; + uint32_t p11 = ((uint32_t*)(src1))[1]; + uint32_t p21 = ((uint32_t*)(src1))[2]; + uint32_t p31 = ((uint32_t*)(src1))[3]; + uint32_t p02 = ((uint32_t*)(src2))[0]; + uint32_t p12 = ((uint32_t*)(src2))[1]; + uint32_t p22 = ((uint32_t*)(src2))[2]; + uint32_t p32 = ((uint32_t*)(src2))[3]; + uint32_t p03 = ((uint32_t*)(src3))[0]; + uint32_t p13 = ((uint32_t*)(src3))[1]; + uint32_t p23 = ((uint32_t*)(src3))[2]; + uint32_t p33 = ((uint32_t*)(src3))[3]; + ((uint32_t*)(dst))[0] = p00; + ((uint32_t*)(dst))[1] = p01; + ((uint32_t*)(dst))[2] = p02; + ((uint32_t*)(dst))[3] = p03; + ((uint32_t*)(dst1))[0] = p10; + ((uint32_t*)(dst1))[1] = p11; + ((uint32_t*)(dst1))[2] = p12; + ((uint32_t*)(dst1))[3] = p13; + ((uint32_t*)(dst2))[0] = p20; + ((uint32_t*)(dst2))[1] = p21; + ((uint32_t*)(dst2))[2] = p22; + ((uint32_t*)(dst2))[3] = p23; + ((uint32_t*)(dst3))[0] = p30; + ((uint32_t*)(dst3))[1] = p31; + ((uint32_t*)(dst3))[2] = p32; + ((uint32_t*)(dst3))[3] = p33; + src += src_stride * 4; // advance 4 rows + src1 += src_stride * 4; + src2 += src_stride * 4; + src3 += src_stride * 4; + dst += 4 * 4; // advance 4 columns + dst1 += 4 * 4; + dst2 += 4 * 4; + dst3 += 4 * 4; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc index ea1cf82c..95047fa7 100644 --- a/source/rotate_neon64.cc +++ b/source/rotate_neon64.cc @@ -435,6 +435,45 @@ void TransposeUVWx8_NEON(const uint8_t* src, : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); } + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. + "1: \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n" + "subs %w8, %w8, #4 \n" // w -= 4 + "st1 {v0.4s}, [%4], 16 \n" + "st1 {v1.4s}, [%5], 16 \n" + "st1 {v2.4s}, [%6], 16 \n" + "st1 {v3.4s}, [%7], 16 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(dst1), // %5 + "+r"(dst2), // %6 + "+r"(dst3), // %7 + "+r"(width) // %8 + : "r"((ptrdiff_t)(src_stride * 4)) // %9 + : "memory", "cc", "v0", "v1", "v2", "v3"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/source/row_any.cc b/source/row_any.cc index 012f0fb2..fae4b8a2 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -571,6 +571,9 @@ ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) #ifdef HAS_MERGEUVROW_AVX2 ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) #endif +#ifdef HAS_MERGEUVROW_AVX512BW +ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31) +#endif #ifdef HAS_MERGEUVROW_NEON ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) #endif diff --git a/source/row_gcc.cc b/source/row_gcc.cc index f36d0cf0..79c158a9 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -17,6 +17,8 @@ extern "C" { // This module is for GCC x86 and x64. #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#include <immintrin.h> + #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) // Constants for ARGB @@ -5142,6 +5144,25 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, } #endif // HAS_DETILESPLITUVROW_SSSE3 +#ifdef HAS_MERGEUVROW_AVX512BW +__attribute__ ((target("avx512vl,avx512bw"))) +void MergeUVRow_AVX512BW(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + do { + const __m512i u = _mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_u)); + const __m512i v = _mm512_slli_epi64(_mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_v)), 8); + const __m512i uv = _mm512_or_si512(u, v); + _mm512_storeu_epi8(dst_uv, uv); + src_u += 32; + src_v += 32; + dst_uv += 64; + width -= 32; + } while (width > 0); +} +#endif // HAS_MERGEUVROW_AVX512BW + #ifdef HAS_MERGEUVROW_AVX2 void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, diff --git a/source/row_lasx.cc b/source/row_lasx.cc index f824906d..29ac9254 100644 --- a/source/row_lasx.cc +++ b/source/row_lasx.cc @@ -2036,8 +2036,8 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; - asm volatile( + int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants @@ -2047,19 +2047,21 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, "xvld $xr4, %0, 0 \n\t" "xvld $xr5, %0, 32 \n\t" "xvld $xr6, %0, 64 \n\t" - "xvld $xr7, %0, 96 \n\t" // load 32 pixels of ARGB + "xvld $xr7, %0, 96 \n\t" // load 32 pixels of + // ARGB "xvor.v $xr12, $xr3, $xr3 \n\t" "xvor.v $xr13, $xr3, $xr3 \n\t" - "addi.d %2, %2, -32 \n\t" // 32 processed per loop. - "xvpickev.b $xr8, $xr5, $xr4 \n\t" //BR + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. + "xvpickev.b $xr8, $xr5, $xr4 \n\t" // BR "xvpickev.b $xr10, $xr7, $xr6 \n\t" - "xvpickod.b $xr9, $xr5, $xr4 \n\t" //GA + "xvpickod.b $xr9, $xr5, $xr4 \n\t" // GA "xvpickod.b $xr11, $xr7, $xr6 \n\t" - "xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" //B + "xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" // B "xvmaddwev.h.bu $xr13, $xr10, $xr0 \n\t" - "xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" //G + "xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" // G "xvmaddwev.h.bu $xr13, $xr11, $xr1 \n\t" - "xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" //R + "xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" // R "xvmaddwod.h.bu $xr13, $xr10, $xr2 \n\t" "addi.d %0, %0, 128 \n\t" "xvpickod.b $xr10, $xr13, $xr12 \n\t" @@ -2067,13 +2069,11 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, "xvst $xr11, %1, 0 \n\t" "addi.d %1, %1, 32 \n\t" "bnez %2, 1b \n\t" - : "+&r"(src_argb), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 - : "r"(rgbconstants), - "r"(shuff) - : "memory" - ); + : "+&r"(src_argb), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), "r"(shuff) + : "memory"); } void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -2098,8 +2098,8 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; - asm volatile( + int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants @@ -2109,19 +2109,21 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, "xvld $xr4, %0, 0 \n\t" "xvld $xr5, %0, 32 \n\t" "xvld $xr6, %0, 64 \n\t" - "xvld $xr7, %0, 96 \n\t" // load 32 pixels of RGBA + "xvld $xr7, %0, 96 \n\t" // load 32 pixels of + // RGBA "xvor.v $xr12, $xr3, $xr3 \n\t" "xvor.v $xr13, $xr3, $xr3 \n\t" - "addi.d %2, %2, -32 \n\t" // 32 processed per loop. - "xvpickev.b $xr8, $xr5, $xr4 \n\t" //AG + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. + "xvpickev.b $xr8, $xr5, $xr4 \n\t" // AG "xvpickev.b $xr10, $xr7, $xr6 \n\t" - "xvpickod.b $xr9, $xr5, $xr4 \n\t" //BR + "xvpickod.b $xr9, $xr5, $xr4 \n\t" // BR "xvpickod.b $xr11, $xr7, $xr6 \n\t" - "xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" //B + "xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" // B "xvmaddwev.h.bu $xr13, $xr11, $xr0 \n\t" - "xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" //G + "xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" // G "xvmaddwod.h.bu $xr13, $xr10, $xr1 \n\t" - "xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" //R + "xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" // R "xvmaddwod.h.bu $xr13, $xr11, $xr2 \n\t" "addi.d %0, %0, 128 \n\t" "xvpickod.b $xr10, $xr13, $xr12 \n\t" @@ -2129,13 +2131,11 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, "xvst $xr11, %1, 0 \n\t" "addi.d %1, %1, 32 \n\t" "bnez %2, 1b \n\t" - : "+&r"(src_rgba), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 - : "r"(rgbconstants), - "r"(shuff) - : "memory" - ); + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), "r"(shuff) + : "memory"); } void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { @@ -2151,18 +2151,19 @@ void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { } static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { - int8_t shuff[128] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, - 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, - 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, - 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, - 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, - 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, - 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, - 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile( + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int8_t shuff[128] = { + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, + 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, + 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, + 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, + 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, + 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; + asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants @@ -2174,23 +2175,25 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, "1: \n\t" "xvld $xr8, %0, 0 \n\t" "xvld $xr9, %0, 32 \n\t" - "xvld $xr10, %0, 64 \n\t" // load 32 pixels of RGB + "xvld $xr10, %0, 64 \n\t" // load 32 pixels of + // RGB "xvor.v $xr12, $xr3, $xr3 \n\t" "xvor.v $xr13, $xr3, $xr3 \n\t" "xvor.v $xr11, $xr9, $xr9 \n\t" - "addi.d %2, %2, -32 \n\t" // 32 processed per loop. - "xvpermi.q $xr9, $xr8, 0x30 \n\t" //src0 - "xvpermi.q $xr8, $xr10, 0x03 \n\t" //src1 - "xvpermi.q $xr10, $xr11, 0x30 \n\t" //src2 + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. + "xvpermi.q $xr9, $xr8, 0x30 \n\t" // src0 + "xvpermi.q $xr8, $xr10, 0x03 \n\t" // src1 + "xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2 "xvshuf.b $xr14, $xr8, $xr9, $xr4 \n\t" "xvshuf.b $xr15, $xr8, $xr10, $xr5 \n\t" "xvshuf.b $xr16, $xr8, $xr9, $xr6 \n\t" "xvshuf.b $xr17, $xr8, $xr10, $xr7 \n\t" - "xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" //G + "xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" // G "xvmaddwev.h.bu $xr13, $xr17, $xr1 \n\t" - "xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" //B + "xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" // B "xvmaddwev.h.bu $xr13, $xr15, $xr0 \n\t" - "xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" //R + "xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" // R "xvmaddwod.h.bu $xr13, $xr15, $xr2 \n\t" "addi.d %0, %0, 96 \n\t" "xvpickod.b $xr10, $xr13, $xr12 \n\t" @@ -2202,8 +2205,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, "+&r"(width) // %2 : "r"(rgbconstants), // %3 "r"(shuff) // %4 - : "memory" - ); + : "memory"); } void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { diff --git a/source/row_lsx.cc b/source/row_lsx.cc index 0825b633..9c1e16f2 100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -1679,7 +1679,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants @@ -1688,31 +1688,32 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, "vld $vr4, %0, 0 \n\t" "vld $vr5, %0, 16 \n\t" "vld $vr6, %0, 32 \n\t" - "vld $vr7, %0, 48 \n\t" // load 16 pixels of ARGB + "vld $vr7, %0, 48 \n\t" // load 16 pixels of + // ARGB "vor.v $vr12, $vr3, $vr3 \n\t" "vor.v $vr13, $vr3, $vr3 \n\t" - "addi.d %2, %2, -16 \n\t" // 16 processed per loop. - "vpickev.b $vr8, $vr5, $vr4 \n\t" //BR + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. + "vpickev.b $vr8, $vr5, $vr4 \n\t" // BR "vpickev.b $vr10, $vr7, $vr6 \n\t" - "vpickod.b $vr9, $vr5, $vr4 \n\t" //GA + "vpickod.b $vr9, $vr5, $vr4 \n\t" // GA "vpickod.b $vr11, $vr7, $vr6 \n\t" - "vmaddwev.h.bu $vr12, $vr8, $vr0 \n\t" //B + "vmaddwev.h.bu $vr12, $vr8, $vr0 \n\t" // B "vmaddwev.h.bu $vr13, $vr10, $vr0 \n\t" - "vmaddwev.h.bu $vr12, $vr9, $vr1 \n\t" //G + "vmaddwev.h.bu $vr12, $vr9, $vr1 \n\t" // G "vmaddwev.h.bu $vr13, $vr11, $vr1 \n\t" - "vmaddwod.h.bu $vr12, $vr8, $vr2 \n\t" //R + "vmaddwod.h.bu $vr12, $vr8, $vr2 \n\t" // R "vmaddwod.h.bu $vr13, $vr10, $vr2 \n\t" "addi.d %0, %0, 64 \n\t" "vpickod.b $vr10, $vr13, $vr12 \n\t" "vst $vr10, %1, 0 \n\t" "addi.d %1, %1, 16 \n\t" "bnez %2, 1b \n\t" - : "+&r"(src_argb), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 + : "+&r"(src_argb), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 : "r"(rgbconstants) - : "memory" - ); + : "memory"); } void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1737,7 +1738,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants @@ -1746,31 +1747,32 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, "vld $vr4, %0, 0 \n\t" "vld $vr5, %0, 16 \n\t" "vld $vr6, %0, 32 \n\t" - "vld $vr7, %0, 48 \n\t" // load 16 pixels of RGBA + "vld $vr7, %0, 48 \n\t" // load 16 pixels of + // RGBA "vor.v $vr12, $vr3, $vr3 \n\t" "vor.v $vr13, $vr3, $vr3 \n\t" - "addi.d %2, %2, -16 \n\t" // 16 processed per loop. - "vpickev.b $vr8, $vr5, $vr4 \n\t" //AG + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. + "vpickev.b $vr8, $vr5, $vr4 \n\t" // AG "vpickev.b $vr10, $vr7, $vr6 \n\t" - "vpickod.b $vr9, $vr5, $vr4 \n\t" //BR + "vpickod.b $vr9, $vr5, $vr4 \n\t" // BR "vpickod.b $vr11, $vr7, $vr6 \n\t" - "vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" //B + "vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" // B "vmaddwev.h.bu $vr13, $vr11, $vr0 \n\t" - "vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" //G + "vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" // G "vmaddwod.h.bu $vr13, $vr10, $vr1 \n\t" - "vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" //R + "vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" // R "vmaddwod.h.bu $vr13, $vr11, $vr2 \n\t" "addi.d %0, %0, 64 \n\t" "vpickod.b $vr10, $vr13, $vr12 \n\t" "vst $vr10, %1, 0 \n\t" "addi.d %1, %1, 16 \n\t" "bnez %2, 1b \n\t" - : "+&r"(src_rgba), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 : "r"(rgbconstants) - : "memory" - ); + : "memory"); } void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { @@ -1789,11 +1791,12 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, - 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, - 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, - 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile( + int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, + 20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, + 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, + 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, + 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; + asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants @@ -1805,19 +1808,21 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, "1: \n\t" "vld $vr8, %0, 0 \n\t" "vld $vr9, %0, 16 \n\t" - "vld $vr10, %0, 32 \n\t" // load 16 pixels of RGB + "vld $vr10, %0, 32 \n\t" // load 16 pixels of + // RGB "vor.v $vr12, $vr3, $vr3 \n\t" "vor.v $vr13, $vr3, $vr3 \n\t" - "addi.d %2, %2, -16 \n\t" // 16 processed per loop. + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. "vshuf.b $vr14, $vr9, $vr8, $vr4 \n\t" "vshuf.b $vr15, $vr9, $vr10, $vr5 \n\t" "vshuf.b $vr16, $vr9, $vr8, $vr6 \n\t" "vshuf.b $vr17, $vr9, $vr10, $vr7 \n\t" - "vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" //G + "vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" // G "vmaddwev.h.bu $vr13, $vr17, $vr1 \n\t" - "vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" //B + "vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" // B "vmaddwev.h.bu $vr13, $vr15, $vr0 \n\t" - "vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" //R + "vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" // R "vmaddwod.h.bu $vr13, $vr15, $vr2 \n\t" "addi.d %0, %0, 48 \n\t" "vpickod.b $vr10, $vr13, $vr12 \n\t" @@ -1829,8 +1834,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, "+&r"(width) // %2 : "r"(rgbconstants), // %3 "r"(shuff) // %4 - : "memory" - ); + : "memory"); } void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 7f04b606..df346ee0 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -820,6 +820,28 @@ void MergeUVRow_NEON(const uint8_t* src_u, : "cc", "memory", "v0", "v1" // Clobber List ); } +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON1(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b,v2.16b}, [%0], #32 \n" // load U + "ld1 {v1.16b,v3.16b}, [%1], #32 \n" // load V + "subs %w3, %w3, #32 \n" // 32 processed per loop + "prfm pldl1keep, [%0, 448] \n" + "prfm pldl1keep, [%1, 448] \n" + "st2 {v0.16b,v1.16b,v2.16b,v3.16b}, [%2], #64 \n" // store 32 UV + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} void MergeUVRow_16_NEON(const uint16_t* src_u, const uint16_t* src_v, diff --git a/source/scale.cc b/source/scale.cc index 65a4685f..830754ce 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1065,10 +1065,10 @@ void ScalePlaneBilinearDown(int src_width, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1188,10 +1188,10 @@ void ScalePlaneBilinearDown_16(int src_width, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C; - void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1276,10 +1276,10 @@ void ScalePlaneBilinearUp(int src_width, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = filtering ? ScaleFilterCols_C : ScaleCols_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1744,10 +1744,10 @@ void ScalePlaneBilinearUp_16(int src_width, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; - void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1872,7 +1872,7 @@ static void ScalePlaneSimple(int src_width, const uint8_t* src_ptr, uint8_t* dst_ptr) { int i; - void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, + void (*ScaleCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = ScaleCols_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -1909,7 +1909,7 @@ static void ScalePlaneSimple_16(int src_width, const uint16_t* src_ptr, uint16_t* dst_ptr) { int i; - void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, + void (*ScaleCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = ScaleCols_16_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; diff --git a/source/scale_common.cc b/source/scale_common.cc index f1832403..da9ca713 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -1633,7 +1633,7 @@ void ScalePlaneVertical(int src_height, enum FilterMode filtering) { // TODO(fbarchard): Allow higher bpp. int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; @@ -1712,7 +1712,7 @@ void ScalePlaneVertical_16(int src_height, enum FilterMode filtering) { // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; - void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb, + void (*InterpolateRow)(uint16_t* dst_argb, const uint16_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; @@ -1791,7 +1791,7 @@ void ScalePlaneVertical_16To8(int src_height, // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; // TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions. - void (*InterpolateRow_16To8)(uint8_t * dst_argb, const uint16_t* src_argb, + void (*InterpolateRow_16To8)(uint8_t* dst_argb, const uint16_t* src_argb, ptrdiff_t src_stride, int scale, int dst_width, int source_y_fraction) = InterpolateRow_16To8_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc index 9dec7811..e8d2ca16 100644 --- a/unit_test/rotate_test.cc +++ b/unit_test/rotate_test.cc @@ -14,6 +14,10 @@ #include "libyuv/cpu_id.h" #include "libyuv/rotate.h" +#ifdef ENABLE_ROW_TESTS +#include "libyuv/rotate_row.h" +#endif + namespace libyuv { #define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) @@ -858,4 +862,47 @@ TEST_F(LibYUVRotateTest, I410Rotate270_Opt) { disable_cpu_flags_, benchmark_cpu_info_); } +#if defined(ENABLE_ROW_TESTS) + +TEST_F(LibYUVRotateTest, Transpose4x4) { + // dst width and height + const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3; + const int height = 4; + align_buffer_page_end(src_pixels, height * width * 4); + align_buffer_page_end(dst_pixels_c, width * height * 4); + align_buffer_page_end(dst_pixels_opt, width * height * 4); + + MemRandomize(src_pixels, height * width * 4); + memset(dst_pixels_c, 1, width * height * 4); + memset(dst_pixels_opt, 1, width * height * 4); + + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_c, width * 4, width); + + for (int i = 0; i < benchmark_iterations_; ++i) { +#if defined(__aarch64__) + if (TestCpuFlag(kCpuHasNEON)) { + Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else { + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } +#else + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); +#endif + } + + // for (int i = 0; i < width * height; ++i) { + // EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + // } + + free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(dst_pixels_c); + free_aligned_buffer_page_end(dst_pixels_opt); +} + +#endif // ENABLE_ROW_TESTS + } // namespace libyuv |