diff options
author | Bruce Lai <bruce.lai@sifive.com> | 2023-07-07 05:09:34 -0700 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-07-25 16:38:55 +0000 |
commit | d33edd237313a03a06210dfa392c6c961d769a6c (patch) | |
tree | 34f63b25ec8d33f3db84edafe1c322352eb4971c | |
parent | b76fcd465408d9daf23e15d9dcd1d17df1397e8f (diff) | |
download | libyuv-d33edd237313a03a06210dfa392c6c961d769a6c.tar.gz |
[RVV] Enable ARGBBlendRow_RVV/BlendPlaneRow_RVV
* Run on SiFive internal FPGA:
Test case Speedup
ARGBBlend_Opt 4.60
BlendPlane_Opt 5.96
I420Blend_Opt 5.83
- Also, add code to use ScaleRowDown2Box_RVV in I420Blend
Change-Id: Icc75e05d26b3427a98269d2a33c4474074033264
Signed-off-by: Bruce Lai <bruce.lai@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4681100
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | include/libyuv/row.h | 11 | ||||
-rw-r--r-- | source/planar_functions.cc | 20 | ||||
-rw-r--r-- | source/row_rvv.cc | 75 |
3 files changed, 106 insertions, 0 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index c0cb251d..7fce28b3 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -794,6 +794,7 @@ extern "C" { #define HAS_AB64TOARGBROW_RVV #define HAS_AR64TOARGBROW_RVV #define HAS_ARGBATTENUATEROW_RVV +#define HAS_ARGBBLENDROW_RVV #define HAS_ARGBCOPYYTOALPHAROW_RVV #define HAS_ARGBEXTRACTALPHAROW_RVV #define HAS_ARGBTOAB64ROW_RVV @@ -805,6 +806,7 @@ extern "C" { #define HAS_ABGRTOYROW_RVV #define HAS_ABGRTOYJROW_RVV #define HAS_BGRATOYROW_RVV +#define HAS_BLENDPLANEROW_RVV #define HAS_COPYROW_RVV #define HAS_I400TOARGBROW_RVV #define HAS_I422ALPHATOARGBROW_RVV @@ -4541,6 +4543,10 @@ void ARGBBlendRow_LSX(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void ARGBBlendRow_RVV(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBBlendRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -4567,6 +4573,11 @@ void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void BlendPlaneRow_RVV(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width); void BlendPlaneRow_C(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index ca9d2151..f6ec0dac 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -2832,6 +2832,11 @@ int ARGBBlend(const uint8_t* src_argb0, ARGBBlendRow = ARGBBlendRow_LSX; } #endif +#if defined(HAS_ARGBBLENDROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBBlendRow = ARGBBlendRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); src_argb0 += src_stride_argb0; @@ -2891,6 +2896,11 @@ int BlendPlane(const uint8_t* src_y0, } } #endif +#if defined(HAS_BLENDPLANEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + BlendPlaneRow = BlendPlaneRow_RVV; + } +#endif for (y = 0; y < height; ++y) { BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); @@ -2968,6 +2978,11 @@ int I420Blend(const uint8_t* src_y0, } } #endif +#if defined(HAS_BLENDPLANEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + BlendPlaneRow = BlendPlaneRow_RVV; + } +#endif if (!IS_ALIGNED(width, 2)) { ScaleRowDown2 = ScaleRowDown2Box_Odd_C; } @@ -3004,6 +3019,11 @@ int I420Blend(const uint8_t* src_y0, } } #endif +#if defined(HAS_SCALEROWDOWN2_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleRowDown2 = ScaleRowDown2Box_RVV; + } +#endif // Row buffer for intermediate alpha pixels. align_buffer_64(halfalpha, halfwidth); diff --git a/source/row_rvv.cc b/source/row_rvv.cc index c8df61db..f99a6410 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -1070,6 +1070,81 @@ void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) { RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants); } +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. +// src_argb: RGB values have already been pre-multiplied by the a. +void ARGBBlendRow_RVV(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvlmax_e8m2(); + // clamp255((((256 - a) * b) >> 8) + f) + // = b * (256 - a) / 256 + f + // = b - (b * a / 256) + f + vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl); + do { + vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a; + vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a; + vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r; + vuint8m2_t v_dst_b, v_dst_g, v_dst_r; + vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a, + src_argb, vl); + __riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a, + src_argb1, vl); + + v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl); + v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl); + v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl); + + v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl); + v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl); + v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl); + + v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl); + v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl); + v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl); + + w -= vl; + src_argb += 4 * vl; + src_argb1 += 4 * vl; + dst_argb += 4 * vl; + } while (w > 0); +} + +void BlendPlaneRow_RVV(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + size_t w = (size_t)width; + do { + vuint16m8_t v_dst_u16; + vuint8m4_t v_dst; + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl); + vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl); + vuint8m4_t v_alpha = __riscv_vle8_v_u8m4(alpha, vl); + vuint8m4_t v_255_minus_alpha = __riscv_vrsub_vx_u8m4(v_alpha, 255u, vl); + + // (a * foreground) + (1-a) * background + v_dst_u16 = __riscv_vwmulu_vv_u16m8(v_alpha, v_src0, vl); + v_dst_u16 = + __riscv_vwmaccu_vv_u16m8(v_dst_u16, v_255_minus_alpha, v_src1, vl); + v_dst_u16 = __riscv_vadd_vx_u16m8(v_dst_u16, 255u, vl); + v_dst = __riscv_vnsrl_wx_u8m4(v_dst_u16, 8, vl); + + __riscv_vse8_v_u8m4(dst, v_dst, vl); + w -= vl; + src0 += vl; + src1 += vl; + alpha += vl; + dst += vl; + } while (w > 0); +} + // Attenuate: (f * a + 255) >> 8 void ARGBAttenuateRow_RVV(const uint8_t* src_argb, uint8_t* dst_argb, |