diff options
author | Darren Hsieh <darren.hsieh@sifive.com> | 2023-07-13 00:23:11 -0700 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-07-24 17:07:01 +0000 |
commit | aed6dbef176815fb4186dddb73f6d753ef19073a (patch) | |
tree | 6e53841695923e1801b175d204fa239723dcbbfc | |
parent | 9b6895ccd9eb065f30f3f23383a9b182a5f71e27 (diff) | |
download | libyuv-aed6dbef176815fb4186dddb73f6d753ef19073a.tar.gz |
[RVV] Enable NV{12,21}To{ARGB,RGB24}Row_RVV
* Run on SiFive internal FPGA(w/ -march=rv64gcv):
Test Case Speedup
NV12ToARGB_Opt 12.0
NV21ToARGB_Opt 12.1
NV12ToABGR_Opt 12.6
NV21ToABGR_Opt 12.0
NV12ToRGB24_Opt 12.5
NV21ToRGB24_Opt 11.7
NV12ToRAW_Opt 12.1
NV21ToRAW_Opt 11.4
Change-Id: Icae2bac2b4ebbd4c5a89e847fde9a74fe6481878
Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4707804
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | include/libyuv/row.h | 24 | ||||
-rw-r--r-- | source/convert_argb.cc | 20 | ||||
-rw-r--r-- | source/row_rvv.cc | 212 |
3 files changed, 222 insertions, 34 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index eb7a8d06..c0cb251d 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -820,6 +820,10 @@ extern "C" { #define HAS_MERGERGBROW_RVV #define HAS_MERGEUVROW_RVV #define HAS_MERGEXRGBROW_RVV +#define HAS_NV12TOARGBROW_RVV +#define HAS_NV12TORGB24ROW_RVV +#define HAS_NV21TOARGBROW_RVV +#define HAS_NV21TORGB24ROW_RVV #define HAS_SPLITARGBROW_RVV #define HAS_SPLITRGBROW_RVV #define HAS_SPLITUVROW_RVV @@ -1349,6 +1353,26 @@ void UYVYToARGBRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void NV12ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); diff --git a/source/convert_argb.cc b/source/convert_argb.cc index cc6560de..f3a1f913 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3853,6 +3853,11 @@ int NV12ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV12TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + NV12ToARGBRow = NV12ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); @@ -3938,6 +3943,11 @@ int NV21ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV21TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + NV21ToARGBRow = NV21ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); @@ -4058,6 +4068,11 @@ int NV12ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV12TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + NV12ToRGB24Row = NV12ToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width); @@ -4119,6 +4134,11 @@ int NV21ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_NV21TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + NV21ToRGB24Row = NV21ToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width); diff --git a/source/row_rvv.cc b/source/row_rvv.cc index ad5ccbcd..c8df61db 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -44,35 +44,35 @@ extern "C" { br = yuvconst->kRGBCoeffBias[3] + 32; \ } -// Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422 -#define READYUV422(vl, v_u, v_v, v_y_16) \ - { \ - vuint8m1_t v_tmp0, v_tmp1; \ - vuint8m2_t v_y; \ - vuint16m2_t v_u_16, v_v_16; \ - vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ - v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \ - v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ - v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \ - v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ - v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ - v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ - v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ - v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ - vl = __riscv_vsetvl_e8m2(w); \ - v_y = __riscv_vle8_v_u8m2(src_y, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ +// Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422 +#define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } -// Read [VLEN/8] Y, [VLEN/8] U, and [VLEN/8] V from 444 -#define READYUV444(vl, v_u, v_v, v_y_16) \ - { \ - vuint8m2_t v_y; \ - vl = __riscv_vsetvl_e8m2(w); \ - v_y = __riscv_vle8_v_u8m2(src_y, vl); \ - v_u = __riscv_vle8_v_u8m2(src_u, vl); \ - v_v = __riscv_vle8_v_u8m2(src_v, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ +// Read [2*VLEN/8] Y, [2*VLEN/8] U, and [2*VLEN/8] V from 444 +#define READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \ + { \ + vuint8m2_t v_y; \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_u = __riscv_vle8_v_u8m2(src_u, vl); \ + v_v = __riscv_vle8_v_u8m2(src_v, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } // Convert from YUV to fixed point RGB @@ -103,6 +103,44 @@ extern "C" { v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \ } +// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv +#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_uv, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Read 2*[VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu +#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_vu, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { size_t avl = (size_t)4 * width; do { @@ -278,7 +316,7 @@ void I444ToARGBRow_RVV(const uint8_t* src_y, YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { - READYUV444(vl, v_u, v_v, v_y_16); + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -307,7 +345,7 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y, vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { - READYUV444(vl, v_u, v_v, v_y_16); + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); v_a = __riscv_vle8_v_u8m2(src_a, vl); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); @@ -337,7 +375,7 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y, vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { - READYUV444(vl, v_u, v_v, v_y_16); + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -366,7 +404,7 @@ void I422ToARGBRow_RVV(const uint8_t* src_y, YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { - READYUV422(vl, v_u, v_v, v_y_16); + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -395,7 +433,7 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y, vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { - READYUV422(vl, v_u, v_v, v_y_16); + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); v_a = __riscv_vle8_v_u8m2(src_a, vl); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); @@ -426,7 +464,7 @@ void I422ToRGBARow_RVV(const uint8_t* src_y, YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { - READYUV422(vl, v_u, v_v, v_y_16); + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -454,7 +492,7 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y, vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { - READYUV422(vl, v_u, v_v, v_y_16); + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); @@ -533,6 +571,112 @@ void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { } while (w > 0); } +void NV12ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void NV12ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void NV21ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void NV21ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} + // Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1 void InterpolateRow_RVV(uint8_t* dst_ptr, const uint8_t* src_ptr, |