From 68659d0d681b4c2318407f7dbc6eaa40055adba1 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 12 Apr 2023 15:32:23 -0700 Subject: UVScale down by 2 fix for C and optimize for NEON - update cpu_id to use "re" for fopen to avoid leaking handles if a thread is started while the file is open. Bug: libyuv:958 Change-Id: I1af9de68fce12e440e1226fc8070634ccb1bf090 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4417176 Reviewed-by: Wan-Teh Chang Commit-Queue: Frank Barchard --- README.chromium | 2 +- include/libyuv/scale_row.h | 2 ++ include/libyuv/version.h | 2 +- source/cpu_id.cc | 6 ++-- source/scale_any.cc | 16 ++++++++++ source/scale_common.cc | 15 +++------ source/scale_neon.cc | 39 +++++++++++++++++++++++ source/scale_neon64.cc | 39 +++++++++++++++++++++++ source/scale_uv.cc | 34 ++++++++++---------- unit_test/scale_uv_test.cc | 78 +++++++++++++++------------------------------- 10 files changed, 148 insertions(+), 85 deletions(-) diff --git a/README.chromium b/README.chromium index 16398820..51ee6633 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1865 +Version: 1866 License: BSD License File: LICENSE diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 7996ea05..a7957c3f 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -133,6 +133,8 @@ extern "C" { #define HAS_SCALEROWDOWN34_NEON #define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEUVROWDOWN2_NEON +#define HAS_SCALEUVROWDOWN2LINEAR_NEON #define HAS_SCALEUVROWDOWN2BOX_NEON #define HAS_SCALEUVROWDOWNEVEN_NEON #define HAS_SCALEROWUP2_LINEAR_NEON diff --git a/include/libyuv/version.h b/include/libyuv/version.h index c2b342ef..a7a65635 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1865 +#define 
LIBYUV_VERSION 1866 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/cpu_id.cc b/source/cpu_id.cc index 409456e8..efad1b36 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -137,7 +137,7 @@ static int GetXCR0() { // For Arm, but public to allow testing on any CPU LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; - FILE* f = fopen(cpuinfo_name, "r"); + FILE* f = fopen(cpuinfo_name, "re"); if (!f) { // Assume Neon if /proc/cpuinfo is unavailable. // This will occur for Chrome sandbox for Pepper or Render process. @@ -166,7 +166,7 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; int flag = 0x0; - FILE* f = fopen(cpuinfo_name, "r"); + FILE* f = fopen(cpuinfo_name, "re"); if (!f) { // Assume nothing if /proc/cpuinfo is unavailable. // This will occur for Chrome sandbox for Pepper or Render process. @@ -194,7 +194,7 @@ LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { LIBYUV_API SAFEBUFFERS int RiscvCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; int flag = 0x0; - FILE* f = fopen(cpuinfo_name, "r"); + FILE* f = fopen(cpuinfo_name, "re"); if (!f) { // Assume nothing if /proc/cpuinfo is unavailable. // This will occur for Chrome sandbox for Pepper or Render process. 
diff --git a/source/scale_any.cc b/source/scale_any.cc index 317041f8..f6576874 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -128,6 +128,22 @@ SDODD(ScaleRowDown2Box_Odd_NEON, 1, 15) #endif +#ifdef HAS_SCALEUVROWDOWN2_NEON +SDANY(ScaleUVRowDown2_Any_NEON, + ScaleUVRowDown2_NEON, + ScaleUVRowDown2_C, + 2, + 2, + 7) +#endif +#ifdef HAS_SCALEUVROWDOWN2LINEAR_NEON +SDANY(ScaleUVRowDown2Linear_Any_NEON, + ScaleUVRowDown2Linear_NEON, + ScaleUVRowDown2Linear_C, + 2, + 2, + 7) +#endif #ifdef HAS_SCALEUVROWDOWN2BOX_NEON SDANY(ScaleUVRowDown2Box_Any_NEON, ScaleUVRowDown2Box_NEON, diff --git a/source/scale_common.cc b/source/scale_common.cc index da9ca713..5e603fd4 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -1280,18 +1280,13 @@ void ScaleUVRowDown2_C(const uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); int x; (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[1]; - dst[1] = src[3]; - src += 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[1]; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = src_uv[2]; // Store the 2nd UV + dst_uv[1] = src_uv[3]; + src_uv += 4; + dst_uv += 2; } } diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 6a0d6e1b..ccc75106 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -1428,6 +1428,45 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, #undef LOAD2_DATA32_LANE +void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.16 {q1}, [%1]! 
\n" // store 8 UV + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1"); +} + +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %2, %2, #8 \n" // 8 processed per loop. + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.16 {q0}, [%1]! \n" // store 8 UV + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1"); +} + void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 9f9636e6..ad06ee83 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -1568,6 +1568,45 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, ); } +void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v1.8h}, [%1], #16 \n" // store 8 UV + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1"); +} + +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.8h}, [%1], #16 \n" // store 8 UV + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1"); +} + void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 1b5b1db1..50daa566 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -112,6 +112,22 @@ static void ScaleUVDown2(int src_width, } } #endif +#if defined(HAS_SCALEUVROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON + : ScaleUVRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON + : ScaleUVRowDown2Box_NEON); + } + } +#endif // This code is not enabled. Only box filter is available at this time. #if defined(HAS_SCALEUVROWDOWN2_SSSE3) @@ -130,23 +146,7 @@ static void ScaleUVDown2(int src_width, } } #endif -// This code is not enabled. Only box filter is available at this time. -#if defined(HAS_SCALEUVROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_NEON - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON - : ScaleUVRowDown2Box_Any_NEON); - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_NEON - : (filtering == kFilterLinear ? 
ScaleUVRowDown2Linear_NEON - : ScaleUVRowDown2Box_NEON); - } - } -#endif + #if defined(HAS_SCALEUVROWDOWN2_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleUVRowDown2 = diff --git a/unit_test/scale_uv_test.cc b/unit_test/scale_uv_test.cc index 3d524bef..84908a91 100644 --- a/unit_test/scale_uv_test.cc +++ b/unit_test/scale_uv_test.cc @@ -39,55 +39,35 @@ static int UVTestFilter(int src_width, return 0; } - int i, j; - const int b = 0; // 128 to test for padding/stride. - int64_t src_uv_plane_size = - (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 2LL; - int src_stride_uv = (b * 2 + Abs(src_width)) * 2; + int i; + int64_t src_uv_plane_size = Abs(src_width) * Abs(src_height) * 2LL; + int src_stride_uv = Abs(src_width) * 2; + int64_t dst_uv_plane_size = dst_width * dst_height * 2LL; + int dst_stride_uv = dst_width * 2; align_buffer_page_end(src_uv, src_uv_plane_size); - if (!src_uv) { - printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); - return 0; - } - MemRandomize(src_uv, src_uv_plane_size); - - int64_t dst_uv_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 2LL; - int dst_stride_uv = (b * 2 + dst_width) * 2; - align_buffer_page_end(dst_uv_c, dst_uv_plane_size); align_buffer_page_end(dst_uv_opt, dst_uv_plane_size); - if (!dst_uv_c || !dst_uv_opt) { + + if (!src_uv || !dst_uv_c || !dst_uv_opt) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } + MemRandomize(src_uv, src_uv_plane_size); memset(dst_uv_c, 2, dst_uv_plane_size); - memset(dst_uv_opt, 3, dst_uv_plane_size); - - // Warm up both versions for consistent benchmarks. - MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); - MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. 
- UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); + memset(dst_uv_opt, 123, dst_uv_plane_size); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. double c_time = get_time(); - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); - + UVScale(src_uv, src_stride_uv, src_width, src_height, + dst_uv_c, dst_stride_uv, dst_width, dst_height, f); c_time = (get_time() - c_time); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. double opt_time = get_time(); for (i = 0; i < benchmark_iterations; ++i) { - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); + UVScale(src_uv, src_stride_uv, src_width, src_height, + dst_uv_opt, dst_stride_uv, dst_width, dst_height, f); } opt_time = (get_time() - opt_time) / benchmark_iterations; @@ -95,18 +75,11 @@ static int UVTestFilter(int src_width, printf("filter %d - %8d us C - %8d us OPT\n", f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); - // C version may be a little off from the optimized. Order of - // operations may introduce rounding somewhere. So do a difference - // of the buffers and look to see that the max difference isn't - // over 2. 
int max_diff = 0; - for (i = b; i < (dst_height + b); ++i) { - for (j = b * 2; j < (dst_width + b) * 2; ++j) { - int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] - - dst_uv_opt[(i * dst_stride_uv) + j]); - if (abs_diff > max_diff) { - max_diff = abs_diff; - } + for (i = 0; i < dst_uv_plane_size; ++i) { + int abs_diff = Abs(dst_uv_c[i] - dst_uv_opt[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; } } @@ -121,28 +94,27 @@ static int UVTestFilter(int src_width, #define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom) #define SX(x, nom, denom) static_cast<int>((x / nom) * denom) -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ +#define TEST_FACTOR1(name, filter, nom, denom) \ TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) { \ int diff = UVTestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ + EXPECT_EQ(0, diff); \ } #if defined(ENABLE_FULL_TESTS) -// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but -// filtering is different fixed point implementations for SSSE3, Neon and C. +// Test a scale factor with all 4 filters. Expect exact for SIMD vs C. #define TEST_FACTOR(name, nom, denom) \ - TEST_FACTOR1(name, None, nom, denom, 0) \ - TEST_FACTOR1(name, Linear, nom, denom, 3) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ - TEST_FACTOR1(name, Box, nom, denom, 3) + TEST_FACTOR1(name, None, nom, denom) \ + TEST_FACTOR1(name, Linear, nom, denom) \ + TEST_FACTOR1(name, Bilinear, nom, denom) \ + TEST_FACTOR1(name, Box, nom, denom) #else // Test a scale factor with Bilinear. #define TEST_FACTOR(name, nom, denom) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) + TEST_FACTOR1(name, Bilinear, nom, denom) #endif TEST_FACTOR(2, 1, 2) -- cgit v1.2.3