diff options
author | Wan-Teh Chang <wtc@google.com> | 2023-05-20 16:09:43 -0700 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-05-20 23:29:33 +0000 |
commit | 179b0203e5bd9eeff84de30e03b2517fd3bcb654 (patch) | |
tree | 253baae5779ca6acaf3b7fa54ddd58b40b9d7cc2 | |
parent | 8670bcf17faca69603d2d49fa6ebd0e45123471f (diff) | |
download | libyuv-179b0203e5bd9eeff84de30e03b2517fd3bcb654.tar.gz |
Enable {J400/I400}ToARGBRow_RVV
Run on SiFive internal FPGA*:
I400ToARGB_Opt (~8x vs scalar)
J400ToARGB_Opt (~10x vs scalar)
LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=10
Bug: libyuv:956, libyuv:961
Change-Id: If4e21ec85c4ff79083ec16a6faae0e457129a8de
Signed-off-by: Bruce Lai <bruce.lai@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4544972
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Wan-Teh Chang <wtc@google.com>
-rw-r--r-- | include/libyuv/row.h | 7 | ||||
-rw-r--r-- | source/convert_argb.cc | 11 | ||||
-rw-r--r-- | source/row_rvv.cc | 56 |
3 files changed, 74 insertions, 0 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index a390a68d..3488d256 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -807,6 +807,7 @@ extern "C" { #define HAS_ABGRTOYROW_RVV #define HAS_ABGRTOYJROW_RVV #define HAS_BGRATOYROW_RVV +#define HAS_I400TOARGBROW_RVV #define HAS_I422ALPHATOARGBROW_RVV #define HAS_I422TOARGBROW_RVV #define HAS_I422TORGB24ROW_RVV @@ -814,6 +815,7 @@ extern "C" { #define HAS_I444ALPHATOARGBROW_RVV #define HAS_I444TOARGBROW_RVV #define HAS_I444TORGB24ROW_RVV +#define HAS_J400TOARGBROW_RVV #define HAS_MERGEARGBROW_RVV #define HAS_MERGERGBROW_RVV #define HAS_MERGEXRGBROW_RVV @@ -3531,6 +3533,7 @@ void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -4460,6 +4463,10 @@ void I400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I400ToARGBRow_RVV(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* param, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 99bbb96d..f16c368d 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -2897,6 +2897,11 @@ int I400ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I400TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I400ToARGBRow = I400ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I400ToARGBRow(src_y, dst_argb, yuvconstants, width); @@ -2984,6 +2989,12 @@ int J400ToARGB(const uint8_t* src_y, } } #endif +#if defined(HAS_J400TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + J400ToARGBRow = J400ToARGBRow_RVV; + } +#endif + for (y = 0; y < height; ++y) { J400ToARGBRow(src_y, dst_argb, width); src_y += src_stride_y; diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 904cbf7d..89333321 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -474,6 +474,62 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y, } while (w > 0); } +void I400ToARGBRow_RVV(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + vuint16m4_t v_yb; + vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) sets to round-down mode(2). + asm volatile("csrwi vxrm, 2"); + if (is_yb_positive) { + v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4], vl); + } else { + v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4], vl); + } + do { + vuint8m2_t v_y, v_out; + vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2; + vuint32m8_t v_y1; + vl = __riscv_vsetvl_e8m2(w); + v_y = __riscv_vle8_v_u8m2(src_y, vl); + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); + v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); // 257 * v_y + v_y1 = __riscv_vwmulu_vv_u32m8(v_tmp0, v_yg, vl); + v_tmp1 = __riscv_vnsrl_wx_u16m4(v_y1, 16, vl); + if (is_yb_positive) { + v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl); + } else { + v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl); + } + v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_out, v_out, v_out, v_a, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_y; + v_y = __riscv_vle8_v_u8m2(src_y, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_y, v_y, v_y, v_a, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + void SplitRGBRow_RVV(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, |