diff options
author | Bruce Lai <bruce.lai@sifive.com> | 2023-06-15 04:56:58 -0700 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-06-15 23:45:24 +0000 |
commit | 04821d1e7d60845525e8db55c7bcd41ef5be9406 (patch) | |
tree | 46b29e13d76928a8423b346d085eda3eb215e4bb | |
parent | 552571e8b24b2619c39ec176e6cb8e75d3e7fdd3 (diff) | |
download | libyuv-04821d1e7d60845525e8db55c7bcd41ef5be9406.tar.gz |
[RVV] Enable ARGBExtractAlphaRow/ARGBCopyYToAlphaRow
* Run on SiFive internal FPGA:
  TestARGBExtractAlpha (~3.2x vs scalar)
  TestARGBCopyYToAlpha (~1.6x vs scalar)
Change-Id: I36525c67e8ac3f71ea9d1a58c7dc15a4009d9da1
Signed-off-by: Bruce Lai <bruce.lai@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4617955
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | include/libyuv/row.h | 6 |
-rw-r--r-- | source/convert.cc | 5 |
-rw-r--r-- | source/planar_functions.cc | 10 |
-rw-r--r-- | source/row_rvv.cc | 29 |
-rw-r--r-- | unit_test/planar_test.cc | 27 |
5 files changed, 75 insertions, 2 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 7b866d41..8df334b1 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -798,6 +798,8 @@ extern "C" {
 #define HAS_AB64TOARGBROW_RVV
 #define HAS_AR64TOARGBROW_RVV
 #define HAS_ARGBATTENUATEROW_RVV
+#define HAS_ARGBCOPYYTOALPHAROW_RVV
+#define HAS_ARGBEXTRACTALPHAROW_RVV
 #define HAS_ARGBTOAB64ROW_RVV
 #define HAS_ARGBTOAR64ROW_RVV
 #define HAS_ARGBTORAWROW_RVV
@@ -3081,6 +3083,9 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
 void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width);
+void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width);
 void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
                                   uint8_t* dst_ptr,
                                   int width);
@@ -3100,6 +3105,7 @@ void ARGBExtractAlphaRow_Any_LSX(const uint8_t* src_ptr,
 void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
 void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width);
 void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
                                   uint8_t* dst_ptr,
                                   int width);
diff --git a/source/convert.cc b/source/convert.cc
index b11ab1bf..b68fb1d3 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -2128,6 +2128,11 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
                                                 : ARGBExtractAlphaRow_Any_LSX;
   }
 #endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index d115a2a1..dcc37836 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -5340,6 +5340,11 @@ int ARGBExtractAlpha(const uint8_t* src_argb,
                                                 : ARGBExtractAlphaRow_Any_LSX;
   }
 #endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV;
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
     ARGBExtractAlphaRow(src_argb, dst_a, width);
@@ -5391,6 +5396,11 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBCopyYToAlphaRow(src_y, dst_argb, width);
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
index 27e91a3b..a79560c7 100644
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@@ -948,6 +948,35 @@ void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
   } while (w > 0);
 }
 
+void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2_t v_b, v_g, v_r, v_a;
+    __riscv_vlseg4e8_v_u8m2(&v_r, &v_g, &v_b, &v_a, src_argb, vl);
+    __riscv_vse8_v_u8m2(dst_a, v_a, vl);
+    w -= vl;
+    src_argb += vl * 4;
+    dst_a += vl;
+  } while (w > 0);
+}
+
+void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
+  size_t w = (size_t)width;
+  const ptrdiff_t dst_stride = 4;
+  dst += 3;
+  do {
+    size_t vl = __riscv_vsetvl_e8m8(w);
+    vuint8m8_t v_a = __riscv_vle8_v_u8m8(src, vl);
+    __riscv_vsse8_v_u8m8(dst, dst_stride, v_a, vl);
+    w -= vl;
+    src += vl;
+    dst += vl * dst_stride;
+  } while (w > 0);
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index e990f36b..1019a7b3 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2749,12 +2749,23 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
   MaskCpuFlags(disable_cpu_flags_);
   ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c,
                    benchmark_width_, benchmark_width_, benchmark_height_);
-  MaskCpuFlags(benchmark_cpu_info_);
+  double c_time = get_time();
+  ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+                   benchmark_width_, benchmark_width_, benchmark_height_);
+  c_time = (get_time() - c_time);
 
+  MaskCpuFlags(benchmark_cpu_info_);
+  ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+                   benchmark_width_, benchmark_width_, benchmark_height_);
+  double opt_time = get_time();
   for (int i = 0; i < benchmark_iterations_; ++i) {
     ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
                      benchmark_width_, benchmark_width_, benchmark_height_);
   }
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+  // Report performance of C vs OPT
+  printf("%8d us C - %8d us OPT\n",
+         static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
   for (int i = 0; i < kPixels; ++i) {
     EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
@@ -2777,12 +2788,24 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
   MaskCpuFlags(disable_cpu_flags_);
   ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c,
                    benchmark_width_ * 4, benchmark_width_, benchmark_height_);
-  MaskCpuFlags(benchmark_cpu_info_);
+  double c_time = get_time();
+  ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c,
+                   benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+  c_time = (get_time() - c_time);
 
+  MaskCpuFlags(benchmark_cpu_info_);
+  ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt,
+                   benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+  double opt_time = get_time();
   for (int i = 0; i < benchmark_iterations_; ++i) {
     ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt,
                      benchmark_width_ * 4, benchmark_width_, benchmark_height_);
   }
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+
+  // Report performance of C vs OPT
+  printf("%8d us C - %8d us OPT\n",
+         static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
   for (int i = 0; i < kPixels * 4; ++i) {
     EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }