diff options
author | Bruce Lai <bruce.lai@sifive.com> | 2023-04-09 22:42:30 -0700 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-04-20 19:49:55 +0000 |
commit | 1330a79e9fcd86d06c26b5be861a6ff7946893f4 (patch) | |
tree | 6f4731cdbc7e8b3fae163256dd8a2437508264d4 | |
parent | 77c2121f7e6b8e694d6e908bbbe9be24214097da (diff) | |
download | libyuv-1330a79e9fcd86d06c26b5be861a6ff7946893f4.tar.gz |
Optimized AR64/AB64 <-> ARGB with RVV
* Run on SiFive internal FPGA:
ARGBToAR64_Opt (~13.7x vs scalar)
ARGBToAB64_Opt (~5.81x vs scalar)
AR64ToARGB_Opt (~15.8x vs scalar)
AB64ToARGB_Opt (~2.40x vs scalar)
LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=10
Bug: libyuv:956
Change-Id: Ida642a5077f59d25fb7c5328f671956b2293dadd
Signed-off-by: Bruce Lai <bruce.lai@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4442913
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | include/libyuv/row.h | 8 | ||||
-rw-r--r-- | source/convert_argb.cc | 10 | ||||
-rw-r--r-- | source/convert_from_argb.cc | 10 | ||||
-rw-r--r-- | source/row_rvv.cc | 73 |
4 files changed, 100 insertions, 1 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 6140443b..176b3781 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -758,6 +758,10 @@ extern "C" { #endif #if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv) +#define HAS_AB64TOARGBROW_RVV +#define HAS_AR64TOARGBROW_RVV +#define HAS_ARGBTOAB64ROW_RVV +#define HAS_ARGBTOAR64ROW_RVV #define HAS_ARGBTORAWROW_RVV #define HAS_ARGBTORGB24ROW_RVV #define HAS_RAWTOARGBROW_RVV @@ -3241,6 +3245,10 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width); void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width); void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width); void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width); +void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width); void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr, uint16_t* dst_ptr, int width); diff --git a/source/convert_argb.cc b/source/convert_argb.cc index f490e9c1..c797a756 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3594,6 +3594,11 @@ int AR64ToARGB(const uint16_t* src_ar64, } } #endif +#if defined(HAS_AR64TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + AR64ToARGBRow = AR64ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { AR64ToARGBRow(src_ar64, dst_argb, width); @@ -3653,6 +3658,11 @@ int AB64ToARGB(const uint16_t* src_ab64, } } #endif +#if defined(HAS_AB64TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + AB64ToARGBRow = AB64ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { AB64ToARGBRow(src_ab64, dst_argb, width); diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index e5608adb..47ee3437 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -2751,6 +2751,11 @@ int ARGBToAR64(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOAR64ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToAR64Row = ARGBToAR64Row_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToAR64Row(src_argb, dst_ar64, width); @@ -2810,6 +2815,11 @@ int ARGBToAB64(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOAB64ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToAB64Row = ARGBToAB64Row_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToAB64Row(src_argb, dst_ab64, width); diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 629eca46..bd21d44e 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -12,7 +12,7 @@ * Copyright (c) 2023 SiFive, Inc. All rights reserved. * * Contributed by Darren Hsieh <darren.hsieh@sifive.com> - * + * Contributed by Bruce Lai <bruce.lai@sifive.com> */ #include <assert.h> @@ -27,6 +27,77 @@ namespace libyuv { extern "C" { #endif +void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { + size_t avl = (size_t)4 * width; + do { + vuint16m8_t v_ar64; + vuint8m4_t v_argb; + size_t vl = __riscv_vsetvl_e8m4(avl); + v_argb = __riscv_vle8_v_u8m4(src_argb, vl); + v_ar64 = __riscv_vwaddu_vx_u16m8(v_argb, 0, vl); + v_ar64 = __riscv_vmul_vx_u16m8(v_ar64, 0x0101, vl); + __riscv_vse16_v_u16m8(dst_ar64, v_ar64, vl); + avl -= vl; + src_argb += vl; + dst_ar64 += vl; + } while (avl > 0); +} + +void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m1(avl); + __riscv_vlseg4e8_v_u8m1(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl); + v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl); + v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl); + v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl); + v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl); + v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl); + v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl); + v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl); + __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r_16, v_g_16, v_b_16, v_a_16, vl); + avl -= vl; + src_argb += 4 * vl; + dst_ab64 += 4 * vl; + } while (avl > 0); +} + +void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)4 * width; + do { + vuint16m8_t v_ar64; + vuint8m4_t v_argb; + size_t vl = __riscv_vsetvl_e16m8(avl); + v_ar64 = __riscv_vle16_v_u16m8(src_ar64, vl); + v_argb = __riscv_vnsrl_wx_u8m4(v_ar64, 8, vl); + __riscv_vse8_v_u8m4(dst_argb, v_argb, vl); + avl -= vl; + src_ar64 += vl; + dst_argb += vl; + } while (avl > 0); +} + +void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e16m2(avl); + __riscv_vlseg4e16_v_u16m2(&v_r_16, &v_g_16, &v_b_16, &v_a_16, src_ab64, vl); + v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl); + v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl); + v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl); + __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + avl -= vl; + src_ab64 += 4 * vl; + dst_argb += 4 * vl; + } while (avl > 0); +} + void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { size_t vl = __riscv_vsetvl_e8m2(width); vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); |