diff options
author | Bruce Lai <bruce.lai@sifive.com> | 2023-04-25 02:08:45 -0700 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-05-11 10:17:51 +0000 |
commit | 59eae49f17c345aa4949271dea2bb088130793af (patch) | |
tree | a89af6dad716978b1f9ad030a37a551807f9cad2 | |
parent | 497ea35688f997edb4c42ef1cdd9f2ab4efb9e29 (diff) | |
download | libyuv-59eae49f17c345aa4949271dea2bb088130793af.tar.gz |
Enable ARGBToYMatrixRow_RVV/RGBAToYMatrixRow_RVV/RGBToYMatrixRow_RVV
Run on SiFive internal FPGA:
ARGBToJ400_Opt (~6x vs scalar)
RGBAToJ400_Opt (~6x vs scalar)
RGB24ToJ400_Opt (~5.5x vs scalar)
LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=10
Change-Id: Ia3ce8cea7962fbd8618cc23e850a7913c9cabf4f
Signed-off-by: Bruce Lai <bruce.lai@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4521783
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | include/libyuv/row.h | 28 | ||||
-rw-r--r-- | source/convert.cc | 55 | ||||
-rw-r--r-- | source/convert_from_argb.cc | 87 | ||||
-rw-r--r-- | source/planar_functions.cc | 5 | ||||
-rw-r--r-- | source/row_rvv.cc | 167 |
5 files changed, 334 insertions, 8 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index a8375606..680982e7 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -782,6 +782,11 @@ extern "C" { #define HAS_ARGBTOAR64ROW_RVV #define HAS_ARGBTORAWROW_RVV #define HAS_ARGBTORGB24ROW_RVV +#define HAS_ARGBTOYROW_RVV +#define HAS_ARGBTOYJROW_RVV +#define HAS_ABGRTOYROW_RVV +#define HAS_ABGRTOYJROW_RVV +#define HAS_BGRATOYROW_RVV #define HAS_I422TOARGBROW_RVV #define HAS_I422TORGB24ROW_RVV #define HAS_I422TORGBAROW_RVV @@ -790,13 +795,19 @@ extern "C" { #define HAS_MERGEARGBROW_RVV #define HAS_MERGERGBROW_RVV #define HAS_MERGEXRGBROW_RVV -#define HAS_SPLITARGBROW_RVV -#define HAS_SPLITRGBROW_RVV -#define HAS_SPLITXRGBROW_RVV #define HAS_RAWTOARGBROW_RVV #define HAS_RAWTORGB24ROW_RVV #define HAS_RAWTORGBAROW_RVV +#define HAS_RAWTOYJROW_RVV +#define HAS_RAWTOYROW_RVV #define HAS_RGB24TOARGBROW_RVV +#define HAS_RGB24TOYJROW_RVV +#define HAS_RGB24TOYROW_RVV +#define HAS_RGBATOYROW_RVV +#define HAS_RGBATOYJROW_RVV +#define HAS_SPLITARGBROW_RVV +#define HAS_SPLITRGBROW_RVV +#define HAS_SPLITXRGBROW_RVV #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -1327,6 +1338,10 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width); void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width); void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width); +void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width); +void ABGRToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width); +void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width); @@ -1548,6 +1563,13 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width); +void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width); void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); diff --git a/source/convert.cc b/source/convert.cc index ad679c59..140be1ab 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1950,6 +1950,11 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif +#if defined(HAS_BGRATOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + BGRAToYRow = BGRAToYRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); @@ -2070,6 +2075,11 @@ int ABGRToI420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); @@ -2174,6 +2184,11 @@ int RGBAToI420(const uint8_t* src_rgba, } } #endif +#if defined(HAS_RGBATOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGBAToYRow = RGBAToYRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); @@ -2193,7 +2208,7 @@ int RGBAToI420(const uint8_t* src_rgba, // Enabled if 1 pass is available #if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_LSX)) + defined(HAS_RGB24TOYROW_LSX) || defined(HAS_RGB24TOYROW_RVV)) #define HAS_RGB24TOYROW #endif @@ -2278,6 +2293,11 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYRow = RGB24ToYRow_RVV; + } +#endif // Other platforms do intermediate conversion from RGB24 to ARGB. #else // HAS_RGB24TOYROW @@ -2367,7 +2387,8 @@ int RGB24ToI420(const uint8_t* src_rgb24, #undef HAS_RGB24TOYROW // Enabled if 1 pass is available -#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) +#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_RVV) #define HAS_RGB24TOYJROW #endif @@ -2448,6 +2469,11 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYJRow = RGB24ToYJRow_RVV; + } +#endif // Other platforms do intermediate conversion from RGB24 to ARGB. #else // HAS_RGB24TOYJROW @@ -2538,7 +2564,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24, // Enabled if 1 pass is available #if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_LSX)) + defined(HAS_RAWTOYROW_LSX) || defined(HAS_RAWTOYROW_RVV)) #define HAS_RAWTOYROW #endif @@ -2622,6 +2648,11 @@ int RAWToI420(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYRow = RAWToYRow_RVV; + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYROW @@ -2711,7 +2742,8 @@ int RAWToI420(const uint8_t* src_raw, #undef HAS_RAWTOYROW // Enabled if 1 pass is available -#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \ + defined(HAS_RAWTOYJROW_RVV) #define HAS_RAWTOYJROW #endif @@ -2792,6 +2824,11 @@ int RAWToJ420(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYJROW @@ -3515,6 +3552,11 @@ int RGB24ToJ400(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYJRow = RGB24ToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RGB24ToYJRow(src_rgb24, dst_yj, width); @@ -3599,6 +3641,11 @@ int RAWToJ400(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RAWToYJRow(src_raw, dst_yj, width); diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 7e6d8647..897b2c11 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -132,6 +132,11 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToUV444Row(src_argb, dst_u, dst_v, width); @@ -256,6 +261,11 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); @@ -373,6 +383,11 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -551,6 +566,11 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -726,6 +746,11 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -902,6 +927,11 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -1085,6 +1115,11 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; @@ -1264,6 +1299,11 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; @@ -1405,6 +1445,11 @@ int ARGBToI400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); @@ -2104,6 +2149,11 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVJRow(src_argb, src_stride_argb, dst_uj, dst_vj, width); @@ -2235,6 +2285,11 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width); @@ -2304,6 +2359,11 @@ int ARGBToJ400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToYJRow(src_argb, dst_yj, width); @@ -2386,6 +2446,11 @@ int RGBAToJ400(const uint8_t* src_rgba, } } #endif +#if defined(HAS_RGBATOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGBAToYJRow = RGBAToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RGBAToYJRow(src_rgba, dst_yj, width); @@ -2496,6 +2561,11 @@ int ABGRToJ420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { ABGRToUVJRow(src_abgr, src_stride_abgr, dst_uj, dst_vj, width); @@ -2623,6 +2693,11 @@ int ABGRToJ422(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width); @@ -2708,6 +2783,11 @@ int ABGRToJ400(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ABGRToYJRow(src_abgr, dst_yj, width); @@ -2846,7 +2926,7 @@ int ARGBToAB64(const uint8_t* src_argb, } // Enabled if 1 pass is available -#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_RVV) #define HAS_RAWTOYJROW #endif @@ -2928,6 +3008,11 @@ int RAWToJNV21(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYJROW diff --git a/source/planar_functions.cc b/source/planar_functions.cc index b0dc2f43..72ede82b 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -4709,6 +4709,11 @@ static int ARGBSobelize(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 39ce71e4..99f23165 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -543,6 +543,173 @@ void MergeXRGBRow_RVV(const uint8_t* src_r, } while (w > 0); } +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_argb += 4 * vl; + dst_y += vl; + } while (w > 0); +} + +void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_RVV(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. +void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgba += 4 * vl; + dst_y += vl; + } while (w > 0); +} + +void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants); +} + +void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgb += 3 * vl; + dst_y += vl; + } while (w > 0); +} + +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants); +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv |