[RVV] Enable CopyRow_RVV, InterpolateRow_RVV, {Merge,Split}UVRow_RVV

* Run on SiFive internal FPGA: MergeUVPlane_Opt(~6x vs scalar) SplitUVPlane_Opt(~6x vs scalar) TestCopyPlane(~8x vs scalar) ARGBInterpolate0_Opt(~10x vs scalar) ARGBInterpolate64_Opt(~9x vs scalar) ARGBInterpolate168_Opt(~9x vs scalar) ARGBInterpolate192_Opt(~8.5x vs scalar) ARGBInterpolate255_Opt(~8x vs scalar) Bug: libyuv:956 Change-Id: I8372341865f75f42e30371ef943d5c2e4be7b79a Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com> Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4574186 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Commit-Queue: Frank Barchard <fbarchard@chromium.org>
author: Darren Hsieh <darren.hsieh@sifive.com> 2023-05-23 02:03:37 -0700
committer: libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> 2023-05-30 09:10:35 +0000
commit: d14bd701c83375a982fe9e237a71cc06bef056e7 (patch)
tree: 0ceb4ba8b400be8576df3aceb1afcc4dba0df919
parent: 78d168054b9b5c98eb0e9e9a95a09f252ea95be3 (diff)
download: libyuv-d14bd701c83375a982fe9e237a71cc06bef056e7.tar.gz
11 files changed, 241 insertions, 3 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 3488d256..3590f125 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -807,6 +807,7 @@ extern "C" {
 #define HAS_ABGRTOYROW_RVV
 #define HAS_ABGRTOYJROW_RVV
 #define HAS_BGRATOYROW_RVV
+#define HAS_COPYROW_RVV
 #define HAS_I400TOARGBROW_RVV
 #define HAS_I422ALPHATOARGBROW_RVV
 #define HAS_I422TOARGBROW_RVV
@@ -815,12 +816,15 @@ extern "C" {
 #define HAS_I444ALPHATOARGBROW_RVV
 #define HAS_I444TOARGBROW_RVV
 #define HAS_I444TORGB24ROW_RVV
+#define HAS_INTERPOLATEROW_RVV
 #define HAS_J400TOARGBROW_RVV
 #define HAS_MERGEARGBROW_RVV
 #define HAS_MERGERGBROW_RVV
+#define HAS_MERGEUVROW_RVV
 #define HAS_MERGEXRGBROW_RVV
 #define HAS_SPLITARGBROW_RVV
 #define HAS_SPLITRGBROW_RVV
+#define HAS_SPLITUVROW_RVV
 #define HAS_SPLITXRGBROW_RVV
 #define HAS_RAWTOARGBROW_RVV
 #define HAS_RAWTORGB24ROW_RVV
@@ -832,9 +836,6 @@ extern "C" {
 #define HAS_RGB24TOYROW_RVV
 #define HAS_RGBATOYROW_RVV
 #define HAS_RGBATOYJROW_RVV
-#define HAS_SPLITARGBROW_RVV
-#define HAS_SPLITRGBROW_RVV
-#define HAS_SPLITXRGBROW_RVV
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -2242,6 +2243,10 @@ void SplitUVRow_LSX(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width);
+void SplitUVRow_RVV(const uint8_t* src_uv,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
 void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
@@ -2403,6 +2408,10 @@ void MergeUVRow_LSX(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width);
+void MergeUVRow_RVV(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width);
 void MergeUVRow_Any_SSE2(const uint8_t* y_buf,
                          const uint8_t* uv_buf,
                          uint8_t* dst_ptr,
@@ -3038,6 +3047,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count);
 void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
 void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -5858,6 +5868,11 @@ void InterpolateRow_LSX(uint8_t* dst_ptr,
                         ptrdiff_t src_stride,
                         int width,
                         int source_y_fraction);
+void InterpolateRow_RVV(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int width,
+                        int source_y_fraction);
 void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
                              const uint8_t* src_ptr,
                              ptrdiff_t src_stride_ptr,
diff --git a/source/convert.cc b/source/convert.cc
index 075428d0..9763d2fe 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -955,6 +955,11 @@ int I422ToNV21(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow = MergeUVRow_RVV;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -995,6 +1000,11 @@ int I422ToNV21(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 
   if (dst_y) {
     CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height);
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 6e05876a..b192a3b7 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -454,6 +454,11 @@ int ARGBToNV12(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
+#endif
   {
     // Allocate a rows of uv.
     align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
@@ -647,6 +652,11 @@ int ARGBToNV21(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
+#endif
   {
     // Allocate a rows of uv.
     align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
@@ -827,6 +837,11 @@ int ABGRToNV12(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
+#endif
   {
     // Allocate a rows of uv.
     align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
@@ -1008,6 +1023,11 @@ int ABGRToNV21(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
+#endif
   {
     // Allocate a rows of uv.
     align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
@@ -3204,6 +3224,11 @@ int RAWToJNV21(const uint8_t* src_raw,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
+#endif
   {
     // Allocate a row of uv.
     align_buffer_64(row_uj, ((halfwidth + 31) & ~31) * 2);
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index e741dc50..d115a2a1 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -75,6 +75,11 @@ void CopyPlane(const uint8_t* src_y,
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
+#if defined(HAS_COPYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    CopyRow = CopyRow_RVV;
+  }
+#endif
 
   // Copy plane
   for (y = 0; y < height; ++y) {
@@ -545,6 +550,11 @@ void SplitUVPlane(const uint8_t* src_uv,
     }
   }
 #endif
+#if defined(HAS_SPLITUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    SplitUVRow = SplitUVRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     // Copy a row of UV.
@@ -631,6 +641,11 @@ void MergeUVPlane(const uint8_t* src_u,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow = MergeUVRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     // Merge a row of U and V into a row of UV.
@@ -4348,6 +4363,11 @@ int InterpolatePlane(const uint8_t* src0,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     InterpolateRow(dst, src0, src1 - src0, width, interpolation);
@@ -5560,6 +5580,12 @@ int UYVYToNV12(const uint8_t* src_uyvy,
     }
   }
 #endif
+#if defined(HAS_SPLITUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    SplitUVRow = SplitUVRow_RVV;
+  }
+#endif
+
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -5600,6 +5626,11 @@ int UYVYToNV12(const uint8_t* src_uyvy,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 
   {
     int awidth = halfwidth * 2;
@@ -5665,6 +5696,7 @@ void HalfMergeUVPlane(const uint8_t* src_u,
     HalfMergeUVRow = HalfMergeUVRow_AVX2;
   }
 #endif
+
   for (y = 0; y < height - 1; y += 2) {
     // Merge a row of U and V into a row of UV.
     HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
diff --git a/source/rotate.cc b/source/rotate.cc
index 6797ff02..8d3978c7 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -214,6 +214,11 @@ void RotatePlane180(const uint8_t* src,
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
+#if defined(HAS_COPYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    CopyRow = CopyRow_RVV;
+  }
+#endif
 
   // Odd height will harmlessly mirror the middle row twice.
   for (y = 0; y < half_height; ++y) {
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index 9667f34c..c7239010 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -192,6 +192,11 @@ static int ARGBRotate180(const uint8_t* src_argb,
     CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
+#if defined(HAS_COPYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    CopyRow = CopyRow_RVV;
+  }
+#endif
 
   // Odd height will harmlessly mirror the middle row twice.
   for (y = 0; y < half_height; ++y) {
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
index be4c4a30..7297a401 100644
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@@ -528,6 +528,75 @@ void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   } while (w > 0);
 }
 
+void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m8(w);
+    vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl);
+    __riscv_vse8_v_u8m8(dst, v_data, vl);
+    w -= vl;
+    src += vl;
+    dst += vl;
+  } while (w > 0);
+}
+
+// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1
+void InterpolateRow_RVV(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int dst_width,
+                        int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  size_t dst_w = (size_t)dst_width;
+  assert(source_y_fraction >= 0);
+  assert(source_y_fraction < 256);
+  // Blend 100 / 0 - Copy row unchanged.
+  if (y1_fraction == 0) {
+    do {
+      size_t vl = __riscv_vsetvl_e8m8(dst_w);
+      __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl);
+      dst_w -= vl;
+      src_ptr += vl;
+      dst_ptr += vl;
+    } while (dst_w > 0);
+    return;
+  }
+  // Blend 50 / 50.
+  if (y1_fraction == 128) {
+    do {
+      size_t vl = __riscv_vsetvl_e8m8(dst_w);
+      vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl);
+      vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl);
+      // Averaging add
+      vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl);
+      __riscv_vse8_v_u8m8(dst_ptr, row_out, vl);
+      dst_w -= vl;
+      src_ptr += vl;
+      src_ptr1 += vl;
+      dst_ptr += vl;
+    } while (dst_w > 0);
+    return;
+  }
+  // General purpose row blend.
+  // To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up(0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    size_t vl = __riscv_vsetvl_e8m4(dst_w);
+    vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl);
+    vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl);
+    vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl);
+    acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl);
+    __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl);
+    dst_w -= vl;
+    src_ptr += vl;
+    src_ptr1 += vl;
+    dst_ptr += vl;
+  } while (dst_w > 0);
+}
+
 void SplitRGBRow_RVV(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
@@ -660,6 +729,42 @@ void MergeXRGBRow_RVV(const uint8_t* src_r,
   } while (w > 0);
 }
 
+void SplitUVRow_RVV(const uint8_t* src_uv,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m4(w);
+    vuint8m4_t v_u, v_v;
+    __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl);
+    __riscv_vse8_v_u8m4(dst_u, v_u, vl);
+    __riscv_vse8_v_u8m4(dst_v, v_v, vl);
+    w -= vl;
+    dst_u += vl;
+    dst_v += vl;
+    src_uv += 2 * vl;
+  } while (w > 0);
+}
+
+void MergeUVRow_RVV(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width) {
+  size_t w = (size_t)width;
+  do {
+    vuint8m4_t v_u, v_v;
+    size_t vl = __riscv_vsetvl_e8m4(w);
+    v_u = __riscv_vle8_v_u8m4(src_u, vl);
+    v_v = __riscv_vle8_v_u8m4(src_v, vl);
+    __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl);
+    w -= vl;
+    src_u += vl;
+    src_v += vl;
+    dst_uv += 2 * vl;
+  } while (w > 0);
+}
+
 struct RgbConstants {
   uint8_t kRGBToY[4];
   uint16_t kAddY;
diff --git a/source/scale.cc b/source/scale.cc
index 591a6a93..80b030dc 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1118,6 +1118,11 @@ void ScalePlaneBilinearDown(int src_width,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 
 #if defined(HAS_SCALEFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -1313,6 +1318,11 @@ void ScalePlaneBilinearUp(int src_width,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 
   if (filtering && src_width >= 32768) {
     ScaleFilterCols = ScaleFilterCols64_C;
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index 8d250947..ddd8d29e 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -348,6 +348,11 @@ static void ScaleARGBBilinearDown(int src_width,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
@@ -468,6 +473,11 @@ static void ScaleARGBBilinearUp(int src_width,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
   if (src_width >= 32768) {
     ScaleARGBFilterCols =
         filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
@@ -724,6 +734,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 
   void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
                               int dst_width, int x, int dx) =
diff --git a/source/scale_common.cc b/source/scale_common.cc
index 5e603fd4..77455903 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -1678,6 +1678,12 @@ void ScalePlaneVertical(int src_height,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
+
   for (j = 0; j < dst_height; ++j) {
     int yi;
     int yf;
diff --git a/source/scale_uv.cc b/source/scale_uv.cc
index 65f986e9..1556071d 100644
--- a/source/scale_uv.cc
+++ b/source/scale_uv.cc
@@ -397,6 +397,11 @@ static void ScaleUVBilinearDown(int src_width,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 #if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
     ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
@@ -511,6 +516,11 @@ static void ScaleUVBilinearUp(int src_width,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
   if (src_width >= 32768) {
     ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C;
   }
author	Darren Hsieh <darren.hsieh@sifive.com>	2023-05-23 02:03:37 -0700
committer	libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>	2023-05-30 09:10:35 +0000
commit	d14bd701c83375a982fe9e237a71cc06bef056e7 (patch)
tree	0ceb4ba8b400be8576df3aceb1afcc4dba0df919
parent	78d168054b9b5c98eb0e9e9a95a09f252ea95be3 (diff)
download	libyuv-d14bd701c83375a982fe9e237a71cc06bef056e7.tar.gz