aboutsummaryrefslogtreecommitdiff
path: root/source/row_rvv.cc
diff options
context:
space:
mode:
authorDarren Hsieh <darren.hsieh@sifive.com>2023-05-02 00:33:27 -0700
committerlibyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>2023-05-10 00:29:20 +0000
commit964d963afb164e768919f5bd2284202d87a3d37c (patch)
treece76e4d8dc4464791f42d1de762a97229da47e99 /source/row_rvv.cc
parent1d940cc570212c8979d81e78738296fe39f9df43 (diff)
downloadlibyuv-964d963afb164e768919f5bd2284202d87a3d37c.tar.gz
Enable I422To{ARGB,RGBA,RGB24}Row_RVV
Run on SiFive internal FPGA: I422ToARGB_Opt (~10x vs scalar) I422ToRGBA_Opt (~10x vs scalar) I420ToRGB24_Opt (~8x vs scalar) LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=10 This CL manually sets rounding mode, since we use fixed-point vector narrowing clip. There is no definition about default value for fixed-point rounding mode. https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#38-vector-fixed-point-rounding-mode-register-vxrm The behavior could be different on differet paltforms. To avoid unexpected behavior, we set rounding mode manually. Change-Id: I90f0dcb90c37f7da7caab8eb1df6c9c7a3c874a8 Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com> Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4512373 Reviewed-by: Wan-Teh Chang <wtc@google.com> Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source/row_rvv.cc')
-rw-r--r--source/row_rvv.cc155
1 files changed, 155 insertions, 0 deletions
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
index 956ed9f9..475d3e66 100644
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@@ -27,6 +27,72 @@ namespace libyuv {
extern "C" {
#endif
+// Fill YUV -> RGB conversion constants into vectors
+// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+// register) is set to round-down mode(2).
+#define YUVTORGB_SETUP(yuvconst, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, \
+ v_br) \
+ { \
+ asm volatile("csrwi vxrm, 2"); \
+ vl = __riscv_vsetvl_e8m1(w); \
+ v_ub = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[0], vl); \
+ v_vr = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[1], vl); \
+ v_ug = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[2], vl); \
+ v_vg = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[3], vl); \
+ v_yg = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[0], vl); \
+ v_bb = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[1], vl); \
+ v_bg = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[2], vl); \
+ v_br = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[3], vl); \
+ }
+
+// Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422
+#define READYUV422(vl, v_u, v_v, v_y_16) \
+ { \
+ vuint8mf2_t v_tmp0, v_tmp1; \
+ vuint8m1_t v_y; \
+ vuint16m1_t v_u_16, v_v_16; \
+ vl = __riscv_vsetvl_e8mf2((w + 1) / 2); \
+ v_tmp0 = __riscv_vle8_v_u8mf2(src_u, vl); \
+ v_u_16 = __riscv_vwaddu_vx_u16m1(v_tmp0, 0, vl); \
+ v_tmp1 = __riscv_vle8_v_u8mf2(src_v, vl); \
+ v_v_16 = __riscv_vwaddu_vx_u16m1(v_tmp1, 0, vl); \
+ v_v_16 = __riscv_vmul_vx_u16m1(v_v_16, 0x0101, vl); \
+ v_u_16 = __riscv_vmul_vx_u16m1(v_u_16, 0x0101, vl); \
+ v_v = __riscv_vreinterpret_v_u16m1_u8m1(v_v_16); \
+ v_u = __riscv_vreinterpret_v_u16m1_u8m1(v_u_16); \
+ vl = __riscv_vsetvl_e8m1(w); \
+ v_y = __riscv_vle8_v_u8m1(src_y, vl); \
+ v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \
+ }
+
+// Convert from YUV to fixed point RGB
+#define YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, \
+ v_g_16, v_b_16, v_r_16) \
+ { \
+ vuint16m2_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \
+ vuint32m4_t v_tmp5; \
+ v_tmp0 = __riscv_vwmulu_vv_u16m2(v_u, v_ug, vl); \
+ v_y_16 = __riscv_vmul_vx_u16m2(v_y_16, 0x0101, vl); \
+ v_tmp0 = __riscv_vwmaccu_vv_u16m2(v_tmp0, v_vg, v_v, vl); \
+ v_tmp1 = __riscv_vwmulu_vv_u16m2(v_u, v_ub, vl); \
+ v_tmp5 = __riscv_vwmulu_vv_u32m4(v_y_16, v_yg, vl); \
+ v_tmp2 = __riscv_vnsrl_wx_u16m2(v_tmp5, 16, vl); \
+ v_tmp3 = __riscv_vadd_vv_u16m2(v_tmp2, v_bg, vl); \
+ v_tmp4 = __riscv_vadd_vv_u16m2(v_tmp2, v_tmp1, vl); \
+ v_tmp2 = __riscv_vwmaccu_vv_u16m2(v_tmp2, v_vr, v_v, vl); \
+ v_g_16 = __riscv_vssubu_vv_u16m2(v_tmp3, v_tmp0, vl); \
+ v_b_16 = __riscv_vssubu_vv_u16m2(v_tmp4, v_bb, vl); \
+ v_r_16 = __riscv_vssubu_vv_u16m2(v_tmp2, v_br, vl); \
+ }
+
+// Convert from fixed point RGB To 8 bit RGB
+#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \
+ { \
+ v_g = __riscv_vnclipu_wx_u8m1(v_g_16, 6, vl); \
+ v_b = __riscv_vnclipu_wx_u8m1(v_b_16, 6, vl); \
+ v_r = __riscv_vnclipu_wx_u8m1(v_r_16, 6, vl); \
+ }
+
void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
size_t avl = (size_t)4 * width;
do {
@@ -186,6 +252,95 @@ void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
} while (w > 0);
}
+void I422ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ vuint8m1_t v_u, v_v;
+ vuint8m1_t v_ub, v_vr, v_ug, v_vg;
+ vuint8m1_t v_b, v_g, v_r, v_a;
+ vuint16m2_t v_yg, v_bb, v_bg, v_br;
+ vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
+ v_br);
+ v_a = __riscv_vmv_v_x_u8m1(255u, vl);
+ do {
+ READYUV422(vl, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl / 2;
+ src_v += vl / 2;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+
+void I422ToRGBARow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ vuint8m1_t v_u, v_v;
+ vuint8m1_t v_ub, v_vr, v_ug, v_vg;
+ vuint8m1_t v_b, v_g, v_r, v_a;
+ vuint16m2_t v_yg, v_bb, v_bg, v_br;
+ vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
+ v_br);
+ v_a = __riscv_vmv_v_x_u8m1(255u, vl);
+ do {
+ READYUV422(vl, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m1(dst_rgba, v_a, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl / 2;
+ src_v += vl / 2;
+ dst_rgba += vl * 4;
+ } while (w > 0);
+}
+
+void I422ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ vuint8m1_t v_u, v_v;
+ vuint8m1_t v_ub, v_vr, v_ug, v_vg;
+ vuint8m1_t v_b, v_g, v_r;
+ vuint16m2_t v_yg, v_bb, v_bg, v_br;
+ vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
+ v_br);
+ do {
+ READYUV422(vl, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl / 2;
+ src_v += vl / 2;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+
void SplitRGBRow_RVV(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,