diff options
author | Darren Hsieh <darren.hsieh@sifive.com> | 2023-05-02 00:33:27 -0700 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-05-10 00:29:20 +0000 |
commit | 964d963afb164e768919f5bd2284202d87a3d37c (patch) | |
tree | ce76e4d8dc4464791f42d1de762a97229da47e99 | |
parent | 1d940cc570212c8979d81e78738296fe39f9df43 (diff) | |
download | libyuv-964d963afb164e768919f5bd2284202d87a3d37c.tar.gz |
Enable I422To{ARGB,RGBA,RGB24}Row_RVV
Run on SiFive internal FPGA:
I422ToARGB_Opt (~10x vs scalar)
I422ToRGBA_Opt (~10x vs scalar)
I420ToRGB24_Opt (~8x vs scalar)
LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=10
This CL manually sets rounding mode,
since we use fixed-point vector narrowing clip.
There is no definition about default value for fixed-point rounding mode.
https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#38-vector-fixed-point-rounding-mode-register-vxrm
The behavior could be different on differet paltforms. To avoid unexpected behavior, we set rounding mode manually.
Change-Id: I90f0dcb90c37f7da7caab8eb1df6c9c7a3c874a8
Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4512373
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | include/libyuv/row.h | 25 | ||||
-rw-r--r-- | source/convert_argb.cc | 35 | ||||
-rw-r--r-- | source/row_common.cc | 6 | ||||
-rw-r--r-- | source/row_rvv.cc | 155 | ||||
-rw-r--r-- | source/scale_argb.cc | 5 |
5 files changed, 221 insertions, 5 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 9a9d1b38..e8e4ae5b 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -782,6 +782,9 @@ extern "C" { #define HAS_ARGBTOAR64ROW_RVV #define HAS_ARGBTORAWROW_RVV #define HAS_ARGBTORGB24ROW_RVV +#define HAS_I422TOARGBROW_RVV +#define HAS_I422TORGB24ROW_RVV +#define HAS_I422TORGBAROW_RVV #define HAS_MERGEARGBROW_RVV #define HAS_MERGERGBROW_RVV #define HAS_MERGEXRGBROW_RVV @@ -853,8 +856,8 @@ typedef uint32_t ulvec32[8]; typedef uint8_t ulvec8[32]; #endif -#if defined(__aarch64__) || defined(__arm__) -// This struct is for ARM color conversion. +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) +// This struct is for ARM and RISC-V color conversion. struct YuvConstants { uvec8 kUVCoeff; vec16 kRGBCoeffBias; @@ -1059,6 +1062,24 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void I444ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 691208fd..b06ece53 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -136,6 +136,11 @@ int I420ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -385,6 +390,11 @@ int I422ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -4511,6 +4521,11 @@ int I422ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGBARow = I422ToRGBARow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); @@ -4734,6 +4749,11 @@ int I420ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGBARow = I422ToRGBARow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); @@ -4859,6 +4879,11 @@ int I420ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGB24Row = I422ToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); @@ -5056,6 +5081,11 @@ int I422ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGB24Row = I422ToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); @@ -5620,6 +5650,11 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; diff --git a/source/row_common.cc b/source/row_common.cc index 74c9be87..b7655b7b 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1484,7 +1484,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // clang-format off -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) // Bias values include subtract 128 from U and V, bias from Y and rounding. // For B and R bias is negative. For G bias is positive. #define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ @@ -1680,7 +1680,7 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) #undef MAKEYUVCONSTANTS -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) #define LOAD_YUV_CONSTANTS \ int ub = yuvconstants->kUVCoeff[0]; \ int vr = yuvconstants->kUVCoeff[1]; \ @@ -1868,7 +1868,7 @@ static __inline void YPixel(uint8_t y, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) int yg = yuvconstants->kRGBCoeffBias[0]; int ygb = yuvconstants->kRGBCoeffBias[4]; #else diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 956ed9f9..475d3e66 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -27,6 +27,72 @@ namespace libyuv { extern "C" { #endif +// Fill YUV -> RGB conversion constants into vectors +// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode +// register) is set to round-down mode(2). +#define YUVTORGB_SETUP(yuvconst, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, \ + v_br) \ + { \ + asm volatile("csrwi vxrm, 2"); \ + vl = __riscv_vsetvl_e8m1(w); \ + v_ub = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[0], vl); \ + v_vr = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[1], vl); \ + v_ug = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[2], vl); \ + v_vg = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[3], vl); \ + v_yg = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[0], vl); \ + v_bb = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[1], vl); \ + v_bg = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[2], vl); \ + v_br = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[3], vl); \ + } + +// Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422 +#define READYUV422(vl, v_u, v_v, v_y_16) \ + { \ + vuint8mf2_t v_tmp0, v_tmp1; \ + vuint8m1_t v_y; \ + vuint16m1_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8mf2((w + 1) / 2); \ + v_tmp0 = __riscv_vle8_v_u8mf2(src_u, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m1(v_tmp0, 0, vl); \ + v_tmp1 = __riscv_vle8_v_u8mf2(src_v, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m1(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m1(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m1(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m1_u8m1(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m1_u8m1(v_u_16); \ + vl = __riscv_vsetvl_e8m1(w); \ + v_y = __riscv_vle8_v_u8m1(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \ + } + +// Convert from YUV to fixed point RGB +#define YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, \ + v_g_16, v_b_16, v_r_16) \ + { \ + vuint16m2_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \ + vuint32m4_t v_tmp5; \ + v_tmp0 = __riscv_vwmulu_vv_u16m2(v_u, v_ug, vl); \ + v_y_16 = __riscv_vmul_vx_u16m2(v_y_16, 0x0101, vl); \ + v_tmp0 = __riscv_vwmaccu_vv_u16m2(v_tmp0, v_vg, v_v, vl); \ + v_tmp1 = __riscv_vwmulu_vv_u16m2(v_u, v_ub, vl); \ + v_tmp5 = __riscv_vwmulu_vv_u32m4(v_y_16, v_yg, vl); \ + v_tmp2 = __riscv_vnsrl_wx_u16m2(v_tmp5, 16, vl); \ + v_tmp3 = __riscv_vadd_vv_u16m2(v_tmp2, v_bg, vl); \ + v_tmp4 = __riscv_vadd_vv_u16m2(v_tmp2, v_tmp1, vl); \ + v_tmp2 = __riscv_vwmaccu_vv_u16m2(v_tmp2, v_vr, v_v, vl); \ + v_g_16 = __riscv_vssubu_vv_u16m2(v_tmp3, v_tmp0, vl); \ + v_b_16 = __riscv_vssubu_vv_u16m2(v_tmp4, v_bb, vl); \ + v_r_16 = __riscv_vssubu_vv_u16m2(v_tmp2, v_br, vl); \ + } + +// Convert from fixed point RGB To 8 bit RGB +#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \ + { \ + v_g = __riscv_vnclipu_wx_u8m1(v_g_16, 6, vl); \ + v_b = __riscv_vnclipu_wx_u8m1(v_b_16, 6, vl); \ + v_r = __riscv_vnclipu_wx_u8m1(v_r_16, 6, vl); \ + } + void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { size_t avl = (size_t)4 * width; do { @@ -186,6 +252,95 @@ void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, } while (w > 0); } +void I422ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + vuint8m1_t v_u, v_v; + vuint8m1_t v_ub, v_vr, v_ug, v_vg; + vuint8m1_t v_b, v_g, v_r, v_a; + vuint16m2_t v_yg, v_bb, v_bg, v_br; + vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, + v_br); + v_a = __riscv_vmv_v_x_u8m1(255u, vl); + do { + READYUV422(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} + +void I422ToRGBARow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + vuint8m1_t v_u, v_v; + vuint8m1_t v_ub, v_vr, v_ug, v_vg; + vuint8m1_t v_b, v_g, v_r, v_a; + vuint16m2_t v_yg, v_bb, v_bg, v_br; + vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, + v_br); + v_a = __riscv_vmv_v_x_u8m1(255u, vl); + do { + READYUV422(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m1(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgba += vl * 4; + } while (w > 0); +} + +void I422ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + vuint8m1_t v_u, v_v; + vuint8m1_t v_ub, v_vr, v_ug, v_vg; + vuint8m1_t v_b, v_g, v_r; + vuint16m2_t v_yg, v_bb, v_bg, v_br; + vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, + v_br); + do { + READYUV422(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgb24 += vl * 3; + } while (w > 0); +} + void SplitRGBRow_RVV(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 7e78cc1b..8d250947 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -675,6 +675,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, |