Add Split/Merge RGB/ARGB/XRGB Row_RVV

* Run on SiFive internal FPGA: SplitRGBPlane_Opt (~6.87x vs scalar) SplitARGBPlane_Opt (~10.77x vs scalar) SplitXRGBPlane_Opt (~18.69x vs scalar) MergeRGBPlane_Opt (~3.63x vs scalar) MergeARGBPlane_Opt (~3.50x vs scalar) MergeXRGBPlane_Opt (~2.90x vs scalar) LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=10 - include a fix to avoid implict conversion warning between size_t & int. Bug: libyuv:956 Change-Id: Icd79b282b04ea3981e7fd4e6d547da6708d82516 Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com> Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4443411 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
author: Darren Hsieh <darren.hsieh@sifive.com> 2023-04-11 00:05:48 -0700
committer: libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> 2023-04-28 18:34:46 +0000
commit: 1b3c4c12d4b7972b6656438a37949309bfb2c18a (patch)
tree: 78a2d5ad44167b9dc5a7328b9b53b5e6218a5b74 /source
parent: 7c6a7e5737ec0afa12f132e8d1831d5ffd9ad623 (diff)
download: libyuv-1b3c4c12d4b7972b6656438a37949309bfb2c18a.tar.gz
2 files changed, 213 insertions, 33 deletions
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index b5a2e1a0..c6f9d5c7 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1268,6 +1268,11 @@ void SplitRGBPlane(const uint8_t* src_rgb,
     }
   }
 #endif
+#if defined(HAS_SPLITRGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    SplitRGBRow = SplitRGBRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     // Copy a row of RGB.
@@ -1327,6 +1332,11 @@ void MergeRGBPlane(const uint8_t* src_r,
     }
   }
 #endif
+#if defined(HAS_MERGERGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeRGBRow = MergeRGBRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     // Merge a row of U and V into a row of RGB.
@@ -1358,6 +1368,9 @@ static void SplitARGBPlaneAlpha(const uint8_t* src_argb,
 
   assert(height > 0);
 
+  if (width <= 0 || height == 0) {
+    return;
+  }
   if (src_stride_argb == width * 4 && dst_stride_r == width &&
       dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) {
     width *= height;
@@ -1398,6 +1411,11 @@ static void SplitARGBPlaneAlpha(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_SPLITARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    SplitARGBRow = SplitARGBRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width);
@@ -1425,6 +1443,9 @@ static void SplitARGBPlaneOpaque(const uint8_t* src_argb,
                        uint8_t* dst_b, int width) = SplitXRGBRow_C;
   assert(height > 0);
 
+  if (width <= 0 || height == 0) {
+    return;
+  }
   if (src_stride_argb == width * 4 && dst_stride_r == width &&
       dst_stride_g == width && dst_stride_b == width) {
     width *= height;
@@ -1464,6 +1485,11 @@ static void SplitARGBPlaneOpaque(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_SPLITXRGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    SplitXRGBRow = SplitXRGBRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width);
@@ -1530,6 +1556,9 @@ static void MergeARGBPlaneAlpha(const uint8_t* src_r,
 
   assert(height > 0);
 
+  if (width <= 0 || height == 0) {
+    return;
+  }
   if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
       src_stride_a == width && dst_stride_argb == width * 4) {
     width *= height;
@@ -1561,6 +1590,11 @@ static void MergeARGBPlaneAlpha(const uint8_t* src_r,
     }
   }
 #endif
+#if defined(HAS_MERGEARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeARGBRow = MergeARGBRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width);
@@ -1590,6 +1624,9 @@ static void MergeARGBPlaneOpaque(const uint8_t* src_r,
 
   assert(height > 0);
 
+  if (width <= 0 || height == 0) {
+    return;
+  }
   if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
       dst_stride_argb == width * 4) {
     width *= height;
@@ -1620,6 +1657,11 @@ static void MergeARGBPlaneOpaque(const uint8_t* src_r,
     }
   }
 #endif
+#if defined(HAS_MERGEXRGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeXRGBRow = MergeXRGBRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     MergeXRGBRow(src_r, src_g, src_b, dst_argb, width);
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
index bd21d44e..0ca4740b 100644
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@@ -99,85 +99,223 @@ void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
 }
 
 void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
-  size_t vl = __riscv_vsetvl_e8m2(width);
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
   vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
   do {
     vuint8m2_t v_b, v_g, v_r;
     __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl);
     __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
-    width -= vl;
-    src_raw += (3 * vl);
-    dst_argb += (4 * vl);
-    vl = __riscv_vsetvl_e8m2(width);
-  } while (width > 0);
+    w -= vl;
+    src_raw += vl * 3;
+    dst_argb += vl * 4;
+    vl = __riscv_vsetvl_e8m2(w);
+  } while (w > 0);
 }
 
 void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
-  size_t vl = __riscv_vsetvl_e8m2(width);
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
   vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
   do {
     vuint8m2_t v_b, v_g, v_r;
     __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl);
     __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
-    width -= vl;
-    src_raw += (3 * vl);
-    dst_rgba += (4 * vl);
-    vl = __riscv_vsetvl_e8m2(width);
-  } while (width > 0);
+    w -= vl;
+    src_raw += vl * 3;
+    dst_rgba += vl * 4;
+    vl = __riscv_vsetvl_e8m2(w);
+  } while (w > 0);
 }
 
 void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  size_t w = (size_t)width;
   do {
     vuint8m2_t v_b, v_g, v_r;
-    size_t vl = __riscv_vsetvl_e8m2(width);
+    size_t vl = __riscv_vsetvl_e8m2(w);
     __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl);
     __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl);
-    width -= vl;
-    src_raw += (3 * vl);
-    dst_rgb24 += (3 * vl);
-  } while (width > 0);
+    w -= vl;
+    src_raw += vl * 3;
+    dst_rgb24 += vl * 3;
+  } while (w > 0);
 }
 
 void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+  size_t w = (size_t)width;
   do {
     vuint8m2_t v_b, v_g, v_r, v_a;
-    size_t vl = __riscv_vsetvl_e8m2(width);
+    size_t vl = __riscv_vsetvl_e8m2(w);
     __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
     __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl);
-    width -= vl;
-    src_argb += (4 * vl);
-    dst_raw += (3 * vl);
-  } while (width > 0);
+    w -= vl;
+    src_argb += vl * 4;
+    dst_raw += vl * 3;
+  } while (w > 0);
 }
 
 void ARGBToRGB24Row_RVV(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {
+  size_t w = (size_t)width;
   do {
     vuint8m2_t v_b, v_g, v_r, v_a;
-    size_t vl = __riscv_vsetvl_e8m2(width);
+    size_t vl = __riscv_vsetvl_e8m2(w);
     __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
     __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
-    width -= vl;
-    src_argb += (4 * vl);
-    dst_rgb24 += (3 * vl);
-  } while (width > 0);
+    w -= vl;
+    src_argb += vl * 4;
+    dst_rgb24 += vl * 3;
+  } while (w > 0);
 }
 
 void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
-  size_t vl = __riscv_vsetvl_e8m2(width);
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
   vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
   do {
     vuint8m2_t v_b, v_g, v_r;
     __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl);
     __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
-    width -= vl;
-    src_rgb24 += (3 * vl);
-    dst_argb += (4 * vl);
-    vl = __riscv_vsetvl_e8m2(width);
-  } while (width > 0);
+    w -= vl;
+    src_rgb24 += vl * 3;
+    dst_argb += vl * 4;
+    vl = __riscv_vsetvl_e8m2(w);
+  } while (w > 0);
+}
+
+void SplitRGBRow_RVV(const uint8_t* src_rgb,
+                     uint8_t* dst_r,
+                     uint8_t* dst_g,
+                     uint8_t* dst_b,
+                     int width) {
+  size_t w = (size_t)width;
+  do {
+    vuint8m2_t v_b, v_g, v_r;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl);
+    __riscv_vse8_v_u8m2(dst_r, v_r, vl);
+    __riscv_vse8_v_u8m2(dst_g, v_g, vl);
+    __riscv_vse8_v_u8m2(dst_b, v_b, vl);
+    w -= vl;
+    dst_r += vl;
+    dst_g += vl;
+    dst_b += vl;
+    src_rgb += vl * 3;
+  } while (w > 0);
+}
+
+void MergeRGBRow_RVV(const uint8_t* src_r,
+                     const uint8_t* src_g,
+                     const uint8_t* src_b,
+                     uint8_t* dst_rgb,
+                     int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
+    vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
+    vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
+    __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl);
+    w -= vl;
+    src_r += vl;
+    src_g += vl;
+    src_b += vl;
+    dst_rgb += vl * 3;
+  } while (w > 0);
+}
+
+void SplitARGBRow_RVV(const uint8_t* src_argb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      uint8_t* dst_a,
+                      int width) {
+  size_t w = (size_t)width;
+  do {
+    vuint8m2_t v_b, v_g, v_r, v_a;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+    __riscv_vse8_v_u8m2(dst_a, v_a, vl);
+    __riscv_vse8_v_u8m2(dst_r, v_r, vl);
+    __riscv_vse8_v_u8m2(dst_g, v_g, vl);
+    __riscv_vse8_v_u8m2(dst_b, v_b, vl);
+    w -= vl;
+    dst_a += vl;
+    dst_r += vl;
+    dst_g += vl;
+    dst_b += vl;
+    src_argb += vl * 4;
+  } while (w > 0);
+}
+
+void MergeARGBRow_RVV(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      const uint8_t* src_a,
+                      uint8_t* dst_argb,
+                      int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
+    vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
+    vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
+    vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl);
+    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+    w -= vl;
+    src_r += vl;
+    src_g += vl;
+    src_b += vl;
+    src_a += vl;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+
+void SplitXRGBRow_RVV(const uint8_t* src_argb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      int width) {
+  size_t w = (size_t)width;
+  do {
+    vuint8m2_t v_b, v_g, v_r, v_a;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+    __riscv_vse8_v_u8m2(dst_r, v_r, vl);
+    __riscv_vse8_v_u8m2(dst_g, v_g, vl);
+    __riscv_vse8_v_u8m2(dst_b, v_b, vl);
+    w -= vl;
+    dst_r += vl;
+    dst_g += vl;
+    dst_b += vl;
+    src_argb += vl * 4;
+  } while (w > 0);
+}
+
+void MergeXRGBRow_RVV(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_argb,
+                      int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    vuint8m2_t v_r, v_g, v_b;
+    v_r = __riscv_vle8_v_u8m2(src_r, vl);
+    v_g = __riscv_vle8_v_u8m2(src_g, vl);
+    v_b = __riscv_vle8_v_u8m2(src_b, vl);
+    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+    w -= vl;
+    src_r += vl;
+    src_g += vl;
+    src_b += vl;
+    dst_argb += vl * 4;
+    vl = __riscv_vsetvl_e8m2(w);
+  } while (w > 0);
 }
 
 #ifdef __cplusplus
author	Darren Hsieh <darren.hsieh@sifive.com>	2023-04-11 00:05:48 -0700
committer	libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>	2023-04-28 18:34:46 +0000
commit	1b3c4c12d4b7972b6656438a37949309bfb2c18a (patch)
tree	78a2d5ad44167b9dc5a7328b9b53b5e6218a5b74 /source
parent	7c6a7e5737ec0afa12f132e8d1831d5ffd9ad623 (diff)
download	libyuv-1b3c4c12d4b7972b6656438a37949309bfb2c18a.tar.gz