aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDarren Hsieh <darren.hsieh@sifive.com>2023-05-09 01:39:06 -0700
committerlibyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>2023-05-10 19:50:56 +0000
commit497ea35688f997edb4c42ef1cdd9f2ab4efb9e29 (patch)
treef1b3303372165697ab22583783afb8c2b1525e2e
parent964d963afb164e768919f5bd2284202d87a3d37c (diff)
downloadlibyuv-497ea35688f997edb4c42ef1cdd9f2ab4efb9e29.tar.gz
Enable I444To{ARGB,RGB24}Row_RVV
Run on SiFive internal FPGA: I444ToARGB_Opt (~16x vs scalar) I444ToRGB24_Opt (~10x vs scalar) LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=10 Change-Id: Idae7dc46ef648beaa14b58ba3eb56b67b17c9b3b Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com> Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4520761 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r--include/libyuv/row.h14
-rw-r--r--source/convert_argb.cc30
-rw-r--r--source/row_rvv.cc116
3 files changed, 137 insertions, 23 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index e8e4ae5b..a8375606 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -785,6 +785,8 @@ extern "C" {
#define HAS_I422TOARGBROW_RVV
#define HAS_I422TORGB24ROW_RVV
#define HAS_I422TORGBAROW_RVV
+#define HAS_I444TOARGBROW_RVV
+#define HAS_I444TORGB24ROW_RVV
#define HAS_MERGEARGBROW_RVV
#define HAS_MERGERGBROW_RVV
#define HAS_MERGEXRGBROW_RVV
@@ -1062,6 +1064,18 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_RVV(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index b06ece53..62884e5d 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -625,6 +625,11 @@ int I444ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I444TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToARGBRow = I444ToARGBRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -855,6 +860,11 @@ int I444ToRGB24Matrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I444TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToRGB24Row = I444ToRGB24Row_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
I444ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
@@ -5913,6 +5923,11 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I444TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToARGBRow = I444ToARGBRow_RVV;
+ }
+#endif
#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -6047,6 +6062,11 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I444TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToARGBRow = I444ToARGBRow_RVV;
+ }
+#endif
#if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
@@ -6159,6 +6179,11 @@ static int I420ToRGB24MatrixBilinear(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I444TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToRGB24Row = I444ToRGB24Row_RVV;
+ }
+#endif
#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -7625,6 +7650,11 @@ static int I422ToRGB24MatrixLinear(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I444TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToRGB24Row = I444ToRGB24Row_RVV;
+ }
+#endif
#if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
index 475d3e66..39ce71e4 100644
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@@ -65,24 +65,35 @@ extern "C" {
v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \
}
+// Read [VLEN/8] Y, [VLEN/8] U, and [VLEN/8] V from 444
+#define READYUV444(vl, v_u, v_v, v_y_16) \
+ { \
+ vuint8m1_t v_y; \
+ vl = __riscv_vsetvl_e8m1(w); \
+ v_y = __riscv_vle8_v_u8m1(src_y, vl); \
+ v_u = __riscv_vle8_v_u8m1(src_u, vl); \
+ v_v = __riscv_vle8_v_u8m1(src_v, vl); \
+ v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \
+ }
+
// Convert from YUV to fixed point RGB
-#define YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, \
- v_g_16, v_b_16, v_r_16) \
- { \
- vuint16m2_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \
- vuint32m4_t v_tmp5; \
- v_tmp0 = __riscv_vwmulu_vv_u16m2(v_u, v_ug, vl); \
- v_y_16 = __riscv_vmul_vx_u16m2(v_y_16, 0x0101, vl); \
- v_tmp0 = __riscv_vwmaccu_vv_u16m2(v_tmp0, v_vg, v_v, vl); \
- v_tmp1 = __riscv_vwmulu_vv_u16m2(v_u, v_ub, vl); \
- v_tmp5 = __riscv_vwmulu_vv_u32m4(v_y_16, v_yg, vl); \
- v_tmp2 = __riscv_vnsrl_wx_u16m2(v_tmp5, 16, vl); \
- v_tmp3 = __riscv_vadd_vv_u16m2(v_tmp2, v_bg, vl); \
- v_tmp4 = __riscv_vadd_vv_u16m2(v_tmp2, v_tmp1, vl); \
- v_tmp2 = __riscv_vwmaccu_vv_u16m2(v_tmp2, v_vr, v_v, vl); \
- v_g_16 = __riscv_vssubu_vv_u16m2(v_tmp3, v_tmp0, vl); \
- v_b_16 = __riscv_vssubu_vv_u16m2(v_tmp4, v_bb, vl); \
- v_r_16 = __riscv_vssubu_vv_u16m2(v_tmp2, v_br, vl); \
+#define YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, \
+ v_y_16, v_g_16, v_b_16, v_r_16) \
+ { \
+ vuint16m2_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \
+ vuint32m4_t v_tmp5; \
+ v_tmp0 = __riscv_vwmulu_vv_u16m2(v_u, v_ug, vl); \
+ v_y_16 = __riscv_vmul_vx_u16m2(v_y_16, 0x0101, vl); \
+ v_tmp0 = __riscv_vwmaccu_vv_u16m2(v_tmp0, v_vg, v_v, vl); \
+ v_tmp1 = __riscv_vwmulu_vv_u16m2(v_u, v_ub, vl); \
+ v_tmp5 = __riscv_vwmulu_vv_u32m4(v_y_16, v_yg, vl); \
+ v_tmp2 = __riscv_vnsrl_wx_u16m2(v_tmp5, 16, vl); \
+ v_tmp3 = __riscv_vadd_vv_u16m2(v_tmp2, v_bg, vl); \
+ v_tmp4 = __riscv_vadd_vv_u16m2(v_tmp2, v_tmp1, vl); \
+ v_tmp2 = __riscv_vwmaccu_vv_u16m2(v_tmp2, v_vr, v_v, vl); \
+ v_g_16 = __riscv_vssubu_vv_u16m2(v_tmp3, v_tmp0, vl); \
+ v_b_16 = __riscv_vssubu_vv_u16m2(v_tmp4, v_bb, vl); \
+ v_r_16 = __riscv_vssubu_vv_u16m2(v_tmp2, v_br, vl); \
}
// Convert from fixed point RGB To 8 bit RGB
@@ -252,6 +263,65 @@ void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
} while (w > 0);
}
+void I444ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ vuint8m1_t v_u, v_v;
+ vuint8m1_t v_ub, v_vr, v_ug, v_vg;
+ vuint8m1_t v_b, v_g, v_r, v_a;
+ vuint16m2_t v_yg, v_bb, v_bg, v_br;
+ vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
+ v_br);
+ v_a = __riscv_vmv_v_x_u8m1(255u, vl);
+ do {
+ READYUV444(vl, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
+ v_y_16, v_g_16, v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl;
+ src_v += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+
+void I444ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ vuint8m1_t v_u, v_v;
+ vuint8m1_t v_ub, v_vr, v_ug, v_vg;
+ vuint8m1_t v_b, v_g, v_r;
+ vuint16m2_t v_yg, v_bb, v_bg, v_br;
+ vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
+ v_br);
+ do {
+ READYUV444(vl, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
+ v_y_16, v_g_16, v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl;
+ src_v += vl;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+
void I422ToARGBRow_RVV(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -270,8 +340,8 @@ void I422ToARGBRow_RVV(const uint8_t* src_y,
v_a = __riscv_vmv_v_x_u8m1(255u, vl);
do {
READYUV422(vl, v_u, v_v, v_y_16);
- YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16,
- v_b_16, v_r_16);
+ YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
+ v_y_16, v_g_16, v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
__riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
w -= vl;
@@ -300,8 +370,8 @@ void I422ToRGBARow_RVV(const uint8_t* src_y,
v_a = __riscv_vmv_v_x_u8m1(255u, vl);
do {
READYUV422(vl, v_u, v_v, v_y_16);
- YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16,
- v_b_16, v_r_16);
+ YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
+ v_y_16, v_g_16, v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
__riscv_vsseg4e8_v_u8m1(dst_rgba, v_a, v_b, v_g, v_r, vl);
w -= vl;
@@ -329,8 +399,8 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y,
v_br);
do {
READYUV422(vl, v_u, v_v, v_y_16);
- YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16,
- v_b_16, v_r_16);
+ YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
+ v_y_16, v_g_16, v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
__riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl);
w -= vl;