aboutsummaryrefslogtreecommitdiff
path: root/source/row_rvv.cc
diff options
context:
space:
mode:
authorDarren Hsieh <darren.hsieh@sifive.com>2023-05-23 02:03:37 -0700
committerlibyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>2023-05-30 09:10:35 +0000
commitd14bd701c83375a982fe9e237a71cc06bef056e7 (patch)
tree0ceb4ba8b400be8576df3aceb1afcc4dba0df919 /source/row_rvv.cc
parent78d168054b9b5c98eb0e9e9a95a09f252ea95be3 (diff)
downloadlibyuv-d14bd701c83375a982fe9e237a71cc06bef056e7.tar.gz
[RVV] Enable CopyRow_RVV, InterpolateRow_RVV, {Merge,Split}UVRow_RVV
* Run on SiFive internal FPGA: MergeUVPlane_Opt(~6x vs scalar) SplitUVPlane_Opt(~6x vs scalar) TestCopyPlane(~8x vs scalar) ARGBInterpolate0_Opt(~10x vs scalar) ARGBInterpolate64_Opt(~9x vs scalar) ARGBInterpolate168_Opt(~9x vs scalar) ARGBInterpolate192_Opt(~8.5x vs scalar) ARGBInterpolate255_Opt(~8x vs scalar) Bug: libyuv:956 Change-Id: I8372341865f75f42e30371ef943d5c2e4be7b79a Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com> Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4574186 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source/row_rvv.cc')
-rw-r--r--source/row_rvv.cc105
1 files changed, 105 insertions, 0 deletions
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
index be4c4a30..7297a401 100644
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@@ -528,6 +528,75 @@ void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
} while (w > 0);
}
+void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(w);
+ vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl);
+ __riscv_vse8_v_u8m8(dst, v_data, vl);
+ w -= vl;
+ src += vl;
+ dst += vl;
+ } while (w > 0);
+}
+
+// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1
+void InterpolateRow_RVV(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ size_t dst_w = (size_t)dst_width;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+ // Blend 100 / 0 - Copy row unchanged.
+ if (y1_fraction == 0) {
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(dst_w);
+ __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl);
+ dst_w -= vl;
+ src_ptr += vl;
+ dst_ptr += vl;
+ } while (dst_w > 0);
+ return;
+ }
+ // Blend 50 / 50.
+ if (y1_fraction == 128) {
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(dst_w);
+ vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl);
+ vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl);
+ // Averaging add
+ vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl);
+ __riscv_vse8_v_u8m8(dst_ptr, row_out, vl);
+ dst_w -= vl;
+ src_ptr += vl;
+ src_ptr1 += vl;
+ dst_ptr += vl;
+ } while (dst_w > 0);
+ return;
+ }
+ // General purpose row blend.
+ // To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ size_t vl = __riscv_vsetvl_e8m4(dst_w);
+ vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl);
+ vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl);
+ vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl);
+ acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl);
+ __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl);
+ dst_w -= vl;
+ src_ptr += vl;
+ src_ptr1 += vl;
+ dst_ptr += vl;
+ } while (dst_w > 0);
+}
+
void SplitRGBRow_RVV(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
@@ -660,6 +729,42 @@ void MergeXRGBRow_RVV(const uint8_t* src_r,
} while (w > 0);
}
+void SplitUVRow_RVV(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ vuint8m4_t v_u, v_v;
+ __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl);
+ __riscv_vse8_v_u8m4(dst_u, v_u, vl);
+ __riscv_vse8_v_u8m4(dst_v, v_v, vl);
+ w -= vl;
+ dst_u += vl;
+ dst_v += vl;
+ src_uv += 2 * vl;
+ } while (w > 0);
+}
+
+void MergeUVRow_RVV(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m4_t v_u, v_v;
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ v_u = __riscv_vle8_v_u8m4(src_u, vl);
+ v_v = __riscv_vle8_v_u8m4(src_v, vl);
+ __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl);
+ w -= vl;
+ src_u += vl;
+ src_v += vl;
+ dst_uv += 2 * vl;
+ } while (w > 0);
+}
+
struct RgbConstants {
uint8_t kRGBToY[4];
uint16_t kAddY;