diff options
Diffstat (limited to 'source/row_rvv.cc')
-rw-r--r-- | source/row_rvv.cc | 247 |
1 files changed, 119 insertions, 128 deletions
diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 7297a401..27e91a3b 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -29,78 +29,76 @@ extern "C" { // Fill YUV -> RGB conversion constants into vectors // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). -#define YUVTORGB_SETUP(yuvconst, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, \ - v_br) \ - { \ - asm volatile("csrwi vxrm, 0"); \ - vl = __riscv_vsetvl_e8m1(w); \ - v_ub = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[0], vl); \ - v_vr = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[1], vl); \ - v_ug = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[2], vl); \ - v_vg = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[3], vl); \ - v_yg = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[0], vl); \ - v_bb = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[1] + 32, vl); \ - v_bg = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[2] - 32, vl); \ - v_br = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[3] + 32, vl); \ +#define YUVTORGB_SETUP(vl, yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ + { \ + asm volatile("csrwi vxrm, 0"); \ + ub = yuvconst->kUVCoeff[0]; \ + vr = yuvconst->kUVCoeff[1]; \ + ug = yuvconst->kUVCoeff[2]; \ + vg = yuvconst->kUVCoeff[3]; \ + yg = yuvconst->kRGBCoeffBias[0]; \ + bb = yuvconst->kRGBCoeffBias[1] + 32; \ + bg = yuvconst->kRGBCoeffBias[2] - 32; \ + br = yuvconst->kRGBCoeffBias[3] + 32; \ } // Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422 #define READYUV422(vl, v_u, v_v, v_y_16) \ { \ - vuint8mf2_t v_tmp0, v_tmp1; \ - vuint8m1_t v_y; \ - vuint16m1_t v_u_16, v_v_16; \ - vl = __riscv_vsetvl_e8mf2((w + 1) / 2); \ - v_tmp0 = __riscv_vle8_v_u8mf2(src_u, vl); \ - v_u_16 = __riscv_vwaddu_vx_u16m1(v_tmp0, 0, vl); \ - v_tmp1 = __riscv_vle8_v_u8mf2(src_v, vl); \ - v_v_16 = __riscv_vwaddu_vx_u16m1(v_tmp1, 0, vl); \ - v_v_16 = __riscv_vmul_vx_u16m1(v_v_16, 0x0101, vl); \ - v_u_16 = __riscv_vmul_vx_u16m1(v_u_16, 0x0101, vl); \ - v_v = __riscv_vreinterpret_v_u16m1_u8m1(v_v_16); \ - v_u = __riscv_vreinterpret_v_u16m1_u8m1(v_u_16); \ - vl = __riscv_vsetvl_e8m1(w); \ - v_y = __riscv_vle8_v_u8m1(src_y, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } // Read [VLEN/8] Y, [VLEN/8] U, and [VLEN/8] V from 444 #define READYUV444(vl, v_u, v_v, v_y_16) \ { \ - vuint8m1_t v_y; \ - vl = __riscv_vsetvl_e8m1(w); \ - v_y = __riscv_vle8_v_u8m1(src_y, vl); \ - v_u = __riscv_vle8_v_u8m1(src_u, vl); \ - v_v = __riscv_vle8_v_u8m1(src_v, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \ + vuint8m2_t v_y; \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_u = __riscv_vle8_v_u8m2(src_u, vl); \ + v_v = __riscv_vle8_v_u8m2(src_v, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } // Convert from YUV to fixed point RGB -#define YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, \ - v_y_16, v_g_16, v_b_16, v_r_16) \ +#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \ + v_b_16, v_r_16) \ { \ - vuint16m2_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \ - vuint32m4_t v_tmp5; \ - v_tmp0 = __riscv_vwmulu_vv_u16m2(v_u, v_ug, vl); \ - v_y_16 = __riscv_vmul_vx_u16m2(v_y_16, 0x0101, vl); \ - v_tmp0 = __riscv_vwmaccu_vv_u16m2(v_tmp0, v_vg, v_v, vl); \ - v_tmp1 = __riscv_vwmulu_vv_u16m2(v_u, v_ub, vl); \ - v_tmp5 = __riscv_vwmulu_vv_u32m4(v_y_16, v_yg, vl); \ - v_tmp2 = __riscv_vnsrl_wx_u16m2(v_tmp5, 16, vl); \ - v_tmp3 = __riscv_vadd_vv_u16m2(v_tmp2, v_bg, vl); \ - v_tmp4 = __riscv_vadd_vv_u16m2(v_tmp2, v_tmp1, vl); \ - v_tmp2 = __riscv_vwmaccu_vv_u16m2(v_tmp2, v_vr, v_v, vl); \ - v_g_16 = __riscv_vssubu_vv_u16m2(v_tmp3, v_tmp0, vl); \ - v_b_16 = __riscv_vssubu_vv_u16m2(v_tmp4, v_bb, vl); \ - v_r_16 = __riscv_vssubu_vv_u16m2(v_tmp2, v_br, vl); \ + vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \ + vuint32m8_t v_tmp5; \ + v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl); \ + v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); \ + v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl); \ + v_tmp1 = __riscv_vwmulu_vx_u16m4(v_u, ub, vl); \ + v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl); \ + v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl); \ + v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl); \ + v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl); \ + v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl); \ + v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl); \ + v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl); \ + v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl); \ } // Convert from fixed point RGB To 8 bit RGB #define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \ { \ - v_g = __riscv_vnclipu_wx_u8m1(v_g_16, 6, vl); \ - v_b = __riscv_vnclipu_wx_u8m1(v_b_16, 6, vl); \ - v_r = __riscv_vnclipu_wx_u8m1(v_r_16, 6, vl); \ + v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl); \ + v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl); \ + v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \ } void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { @@ -270,20 +268,19 @@ void I444ToARGBRow_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r, v_a; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); - v_a = __riscv_vmv_v_x_u8m1(255u, vl); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { READYUV444(vl, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); w -= vl; src_y += vl; src_u += vl; @@ -301,20 +298,19 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r, v_a; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { READYUV444(vl, v_u, v_v, v_y_16); - v_a = __riscv_vle8_v_u8m1(src_a, vl); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); w -= vl; src_y += vl; src_a += vl; @@ -332,19 +328,18 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { READYUV444(vl, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); w -= vl; src_y += vl; src_u += vl; @@ -361,20 +356,19 @@ void I422ToARGBRow_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r, v_a; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); - v_a = __riscv_vmv_v_x_u8m1(255u, vl); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { READYUV422(vl, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); w -= vl; src_y += vl; src_u += vl / 2; @@ -392,20 +386,19 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r, v_a; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { READYUV422(vl, v_u, v_v, v_y_16); - v_a = __riscv_vle8_v_u8m1(src_a, vl); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); w -= vl; src_y += vl; src_a += vl; @@ -423,20 +416,19 @@ void I422ToRGBARow_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r, v_a; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); - v_a = __riscv_vmv_v_x_u8m1(255u, vl); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { READYUV422(vl, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m1(dst_rgba, v_a, v_b, v_g, v_r, vl); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); w -= vl; src_y += vl; src_u += vl / 2; @@ -453,19 +445,18 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { READYUV422(vl, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); w -= vl; src_y += vl; src_u += vl / 2; |