17 files changed, 2108 insertions, 510 deletions
diff --git a/source/convert.cc b/source/convert.cc
index b11ab1bf..b68fb1d3 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -2128,6 +2128,11 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
                                                 : ARGBExtractAlphaRow_Any_LSX;
   }
 #endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index cc6560de..f6ab0784 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -3853,6 +3853,11 @@ int NV12ToARGBMatrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    NV12ToARGBRow = NV12ToARGBRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
@@ -3938,6 +3943,11 @@ int NV21ToARGBMatrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_NV21TOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    NV21ToARGBRow = NV21ToARGBRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
@@ -4058,6 +4068,11 @@ int NV12ToRGB24Matrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_NV12TORGB24ROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    NV12ToRGB24Row = NV12ToRGB24Row_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
@@ -4119,6 +4134,11 @@ int NV21ToRGB24Matrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_NV21TORGB24ROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    NV21ToRGB24Row = NV21ToRGB24Row_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
@@ -6020,6 +6040,12 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y,
     ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
   }
 #endif
+#if defined(HAS_SCALEROWUP2_BILINEAR_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV;
+    ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+  }
+#endif
 
   // alloc 4 lines temp
   const int row_size = (width + 31) & ~31;
@@ -6151,6 +6177,11 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y,
     ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
   }
 #endif
+#if defined(HAS_SCALEROWUP2_LINEAR_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+  }
+#endif
 
   // alloc 2 lines temp
   const int row_size = (width + 31) & ~31;
@@ -6276,6 +6307,12 @@ static int I420ToRGB24MatrixBilinear(const uint8_t* src_y,
     ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
   }
 #endif
+#if defined(HAS_SCALEROWUP2_BILINEAR_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV;
+    ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+  }
+#endif
 
   // alloc 4 lines temp
   const int row_size = (width + 31) & ~31;
@@ -6837,6 +6874,12 @@ static int I420AlphaToARGBMatrixBilinear(
     ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
   }
 #endif
+#if defined(HAS_SCALEROWUP2_BILINEAR_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV;
+    ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+  }
+#endif
 
   // alloc 4 lines temp
   const int row_size = (width + 31) & ~31;
@@ -7032,6 +7075,11 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y,
     ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
   }
 #endif
+#if defined(HAS_SCALEROWUP2_LINEAR_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+  }
+#endif
 
   // alloc 2 lines temp
   const int row_size = (width + 31) & ~31;
@@ -7770,6 +7818,11 @@ static int I422ToRGB24MatrixLinear(const uint8_t* src_y,
     ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
   }
 #endif
+#if defined(HAS_SCALEROWUP2_LINEAR_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+  }
+#endif
 
   // alloc 2 lines temp
   const int row_size = (width + 31) & ~31;
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index d115a2a1..f6ec0dac 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -2783,37 +2783,6 @@ int RGB24Mirror(const uint8_t* src_rgb24,
   return 0;
 }
 
-// Get a blender that optimized for the CPU and pixel count.
-// As there are 6 blenders to choose from, the caller should try to use
-// the same blend function for all pixels if possible.
-LIBYUV_API
-ARGBBlendRow GetARGBBlend() {
-  void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
-                       uint8_t* dst_argb, int width) = ARGBBlendRow_C;
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlendRow = ARGBBlendRow_SSSE3;
-    return ARGBBlendRow;
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBBlendRow = ARGBBlendRow_NEON;
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_MSA)
-  if (TestCpuFlag(kCpuHasMSA)) {
-    ARGBBlendRow = ARGBBlendRow_MSA;
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    ARGBBlendRow = ARGBBlendRow_LSX;
-  }
-#endif
-  return ARGBBlendRow;
-}
-
 // Alpha Blend 2 ARGB images and store to destination.
 LIBYUV_API
 int ARGBBlend(const uint8_t* src_argb0,
@@ -2826,7 +2795,7 @@ int ARGBBlend(const uint8_t* src_argb0,
               int height) {
   int y;
   void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
-                       uint8_t* dst_argb, int width) = GetARGBBlend();
+                       uint8_t* dst_argb, int width) = ARGBBlendRow_C;
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -2843,7 +2812,31 @@ int ARGBBlend(const uint8_t* src_argb0,
     height = 1;
     src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
   }
-
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBBlendRow = ARGBBlendRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBBlendRow = ARGBBlendRow_NEON;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBBlendRow = ARGBBlendRow_MSA;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBBlendRow = ARGBBlendRow_LSX;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBBlendRow = ARGBBlendRow_RVV;
+  }
+#endif
   for (y = 0; y < height; ++y) {
     ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
     src_argb0 += src_stride_argb0;
@@ -2903,6 +2896,11 @@ int BlendPlane(const uint8_t* src_y0,
     }
   }
 #endif
+#if defined(HAS_BLENDPLANEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    BlendPlaneRow = BlendPlaneRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
@@ -2980,6 +2978,11 @@ int I420Blend(const uint8_t* src_y0,
     }
   }
 #endif
+#if defined(HAS_BLENDPLANEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    BlendPlaneRow = BlendPlaneRow_RVV;
+  }
+#endif
   if (!IS_ALIGNED(width, 2)) {
     ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
   }
@@ -3016,6 +3019,11 @@ int I420Blend(const uint8_t* src_y0,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN2_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleRowDown2 = ScaleRowDown2Box_RVV;
+  }
+#endif
 
   // Row buffer for intermediate alpha pixels.
   align_buffer_64(halfalpha, halfwidth);
@@ -5340,6 +5348,11 @@ int ARGBExtractAlpha(const uint8_t* src_argb,
                                                 : ARGBExtractAlphaRow_Any_LSX;
   }
 #endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV;
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
     ARGBExtractAlphaRow(src_argb, dst_a, width);
@@ -5391,6 +5404,11 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBCopyYToAlphaRow(src_y, dst_argb, width);
diff --git a/source/rotate.cc b/source/rotate.cc
index 8d3978c7..3678b80a 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -489,13 +489,12 @@ int RotatePlane(const uint8_t* src,
   return -1;
 }
 
-LIBYUV_API
-void TransposePlane_16(const uint16_t* src,
-                       int src_stride,
-                       uint16_t* dst,
-                       int dst_stride,
-                       int width,
-                       int height) {
+static void TransposePlane_16(const uint16_t* src,
+                              int src_stride,
+                              uint16_t* dst,
+                              int dst_stride,
+                              int width,
+                              int height) {
   int i = height;
   // Work across the source in 8x8 tiles
   while (i >= 8) {
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index c7239010..034d53e8 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -69,6 +69,11 @@ static int ARGBTranspose(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_RVV;
+  }
+#endif
 
   for (i = 0; i < width; ++i) {  // column of source to row of dest.
     ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
diff --git a/source/rotate_common.cc b/source/rotate_common.cc
index 4b496d1b..e72608e9 100644
--- a/source/rotate_common.cc
+++ b/source/rotate_common.cc
@@ -120,37 +120,6 @@ void TransposeWx8_16_C(const uint16_t* src,
   }
 }
 
-void TransposeUVWx8_16_C(const uint16_t* src,
-                         int src_stride,
-                         uint16_t* dst_a,
-                         int dst_stride_a,
-                         uint16_t* dst_b,
-                         int dst_stride_b,
-                         int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    dst_a[0] = src[0 * src_stride + 0];
-    dst_b[0] = src[0 * src_stride + 1];
-    dst_a[1] = src[1 * src_stride + 0];
-    dst_b[1] = src[1 * src_stride + 1];
-    dst_a[2] = src[2 * src_stride + 0];
-    dst_b[2] = src[2 * src_stride + 1];
-    dst_a[3] = src[3 * src_stride + 0];
-    dst_b[3] = src[3 * src_stride + 1];
-    dst_a[4] = src[4 * src_stride + 0];
-    dst_b[4] = src[4 * src_stride + 1];
-    dst_a[5] = src[5 * src_stride + 0];
-    dst_b[5] = src[5 * src_stride + 1];
-    dst_a[6] = src[6 * src_stride + 0];
-    dst_b[6] = src[6 * src_stride + 1];
-    dst_a[7] = src[7 * src_stride + 0];
-    dst_b[7] = src[7 * src_stride + 1];
-    src += 2;
-    dst_a += dst_stride_a;
-    dst_b += dst_stride_b;
-  }
-}
-
 void TransposeWxH_16_C(const uint16_t* src,
                        int src_stride,
                        uint16_t* dst,
diff --git a/source/row_common.cc b/source/row_common.cc
index 8be37fb5..7591c6b6 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -48,7 +48,6 @@ extern "C" {
                                    defined(__i386__) || defined(_M_IX86))
 #define LIBYUV_ARGBTOUV_PAVGB 1
 #define LIBYUV_RGBTOU_TRUNCATE 1
-#define LIBYUV_ATTENUATE_DUP 1
 #endif
 #if defined(LIBYUV_BIT_EXACT)
 #define LIBYUV_UNATTENUATE_DUP 1
@@ -1876,9 +1875,10 @@ static __inline void YPixel(uint8_t y,
   int yg = yuvconstants->kYToRgb[0];
 #endif
   uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
-  *b = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6));
-  *g = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6));
-  *r = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6));
+  uint8_t b8 = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6));
+  *b = b8;
+  *g = b8;
+  *r = b8;
 }
 
 void I444ToARGBRow_C(const uint8_t* src_y,
@@ -3369,12 +3369,7 @@ void BlendPlaneRow_C(const uint8_t* src0,
 }
 #undef UBLEND
 
-#if LIBYUV_ATTENUATE_DUP
-// This code mimics the SSSE3 version for better testability.
-#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
-#else
-#define ATTENUATE(f, a) (f * a + 128) >> 8
-#endif
+#define ATTENUATE(f, a) (f * a + 255) >> 8
 
 // Multiply source RGB by alpha and store to destination.
 void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index e94fd04d..d8074987 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -7441,93 +7441,106 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
 
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
-                                     7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
-static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-                                     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
+static const vec8 kAttenuateShuffle = {6,    -128, 6,    -128, 6,  -128,
+                                       -128, -128, 14,   -128, 14, -128,
+                                       14,   -128, -128, -128};
+
 // Attenuate 4 pixels at a time.
 void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
   asm volatile(
-      "pcmpeqb     %%xmm3,%%xmm3                 \n"
-      "pslld       $0x18,%%xmm3                  \n"
       "movdqa      %3,%%xmm4                     \n"
-      "movdqa      %4,%%xmm5                     \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "pslld       $0x18,%%xmm5                  \n"
+      "pxor        %%xmm6,%%xmm6                 \n"
+      "pcmpeqb     %%xmm7,%%xmm7                 \n"
+      "punpcklbw   %%xmm6,%%xmm7                 \n"
+      "sub         %0,%1                         \n"
 
       // 4 pixel loop.
       LABELALIGN
       "1:                                        \n"
-      "movdqu      (%0),%%xmm0                   \n"
-      "pshufb      %%xmm4,%%xmm0                 \n"
-      "movdqu      (%0),%%xmm1                   \n"
-      "punpcklbw   %%xmm1,%%xmm1                 \n"
-      "pmulhuw     %%xmm1,%%xmm0                 \n"
-      "movdqu      (%0),%%xmm1                   \n"
-      "pshufb      %%xmm5,%%xmm1                 \n"
-      "movdqu      (%0),%%xmm2                   \n"
-      "punpckhbw   %%xmm2,%%xmm2                 \n"
-      "pmulhuw     %%xmm2,%%xmm1                 \n"
-      "movdqu      (%0),%%xmm2                   \n"
-      "lea         0x10(%0),%0                   \n"
-      "pand        %%xmm3,%%xmm2                 \n"
+      "movdqu      (%0),%%xmm6                   \n"
+      "movdqa      %%xmm6,%%xmm0                 \n"
+      "movdqa      %%xmm6,%%xmm1                 \n"
+      "punpcklbw   %%xmm5,%%xmm0                 \n"
+      "punpckhbw   %%xmm5,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "movdqa      %%xmm1,%%xmm3                 \n"
+      "pshufb      %%xmm4,%%xmm2                 \n"  // a,a,a,0
+      "pshufb      %%xmm4,%%xmm3                 \n"
+      "pmullw      %%xmm2,%%xmm0                 \n"  // rgb * alpha
+      "pmullw      %%xmm3,%%xmm1                 \n"
+      "paddw       %%xmm7,%%xmm0                 \n"  // + 255
+      "paddw       %%xmm7,%%xmm1                 \n"
       "psrlw       $0x8,%%xmm0                   \n"
       "psrlw       $0x8,%%xmm1                   \n"
       "packuswb    %%xmm1,%%xmm0                 \n"
-      "por         %%xmm2,%%xmm0                 \n"
-      "movdqu      %%xmm0,(%1)                   \n"
-      "lea         0x10(%1),%1                   \n"
+      "pand        %%xmm5,%%xmm6                 \n"
+      "por         %%xmm6,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%0,%1)                \n"
+      "lea         0x10(%0),%0                   \n"
       "sub         $0x4,%2                       \n"
       "jg          1b                            \n"
-      : "+r"(src_argb),       // %0
-        "+r"(dst_argb),       // %1
-        "+r"(width)           // %2
-      : "m"(kShuffleAlpha0),  // %3
-        "m"(kShuffleAlpha1)   // %4
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+      : "+r"(src_argb),         // %0
+        "+r"(dst_argb),         // %1
+        "+r"(width)             // %2
+      : "m"(kAttenuateShuffle)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBATTENUATEROW_SSSE3
 
 #ifdef HAS_ARGBATTENUATEROW_AVX2
+
 // Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
-                                         128u, 128u, 14u,  15u, 14u, 15u,
-                                         14u,  15u,  128u, 128u};
+static const lvec8 kAttenuateShuffle_AVX2 = {
+    6,    -128, 6,    -128, 6,    -128, -128, -128, 14,   -128, 14,
+    -128, 14,   -128, -128, -128, 22,   -128, 22,   -128, 22,   -128,
+    -128, -128, 30,   -128, 30,   -128, 30,   -128, -128, -128};
+
 // Attenuate 8 pixels at a time.
 void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int width) {
   asm volatile(
-      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vmovdqa     %3,%%ymm4                     \n"
       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
       "vpslld      $0x18,%%ymm5,%%ymm5           \n"
+      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"
+      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"
+      "vpunpcklbw  %%ymm6,%%ymm7,%%ymm7          \n"
       "sub         %0,%1                         \n"
 
       // 8 pixel loop.
       LABELALIGN
       "1:                                        \n"
       "vmovdqu     (%0),%%ymm6                   \n"
-      "vpunpcklbw  %%ymm6,%%ymm6,%%ymm0          \n"
-      "vpunpckhbw  %%ymm6,%%ymm6,%%ymm1          \n"
+      "vpunpcklbw  %%ymm5,%%ymm6,%%ymm0          \n"
+      "vpunpckhbw  %%ymm5,%%ymm6,%%ymm1          \n"
       "vpshufb     %%ymm4,%%ymm0,%%ymm2          \n"
       "vpshufb     %%ymm4,%%ymm1,%%ymm3          \n"
-      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
-      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
-      "vpand       %%ymm5,%%ymm6,%%ymm6          \n"
+      "vpmullw     %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpmullw     %%ymm3,%%ymm1,%%ymm1          \n"
+      "vpaddw      %%ymm7,%%ymm0,%%ymm0          \n"
+      "vpaddw      %%ymm7,%%ymm1,%%ymm1          \n"
       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
       "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
-      "vpor        %%ymm6,%%ymm0,%%ymm0          \n"
+      "vpand       %%ymm5,%%ymm6,%%ymm1          \n"
+      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"
       "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
       "lea         0x20(%0),%0                   \n"
       "sub         $0x8,%2                       \n"
       "jg          1b                            \n"
       "vzeroupper                                \n"
-      : "+r"(src_argb),          // %0
-        "+r"(dst_argb),          // %1
-        "+r"(width)              // %2
-      : "m"(kShuffleAlpha_AVX2)  // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+      : "+r"(src_argb),              // %0
+        "+r"(dst_argb),              // %1
+        "+r"(width)                  // %2
+      : "m"(kAttenuateShuffle_AVX2)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBATTENUATEROW_AVX2
 
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 4ed13638..31142a90 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -1827,19 +1827,27 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
   );
 }
 
+struct RgbUVConstants {
+  uint8_t kRGBToU[4];
+  uint8_t kRGBToV[4];
+};
+
 // 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8_t* src_argb,
-                         uint8_t* dst_u,
-                         uint8_t* dst_v,
-                         int width) {
-  asm volatile(
-      "vmov.u8     d24, #112                     \n"  // UB / VR 0.875
-                                                      // coefficient
-      "vmov.u8     d25, #74                      \n"  // UG -0.5781 coefficient
-      "vmov.u8     d26, #38                      \n"  // UR -0.2969 coefficient
-      "vmov.u8     d27, #18                      \n"  // VB -0.1406 coefficient
-      "vmov.u8     d28, #94                      \n"  // VG -0.7344 coefficient
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width,
+                               const struct RgbUVConstants* rgbuvconstants) {
+  asm volatile(
+
+      "vld1.8      {d0}, [%4]                    \n"  // load rgbuvconstants
+      "vdup.u8     d24, d0[0]                    \n"  // UB  0.875  coefficient
+      "vdup.u8     d25, d0[1]                    \n"  // UG -0.5781 coefficient
+      "vdup.u8     d26, d0[2]                    \n"  // UR -0.2969 coefficient
+      "vdup.u8     d27, d0[4]                    \n"  // VB -0.1406 coefficient
+      "vdup.u8     d28, d0[5]                    \n"  // VG -0.7344 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
+
       "1:                                        \n"
       "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
       "subs        %3, %3, #8                    \n"  // 8 processed per loop.
@@ -1857,15 +1865,53 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
       "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels U.
       "vst1.8      {d1}, [%2]!                   \n"  // store 8 pixels V.
       "bgt         1b                            \n"
-      : "+r"(src_argb),  // %0
-        "+r"(dst_u),     // %1
-        "+r"(dst_v),     // %2
-        "+r"(width)      // %3
-      :
+      : "+r"(src_argb),      // %0
+        "+r"(dst_u),         // %1
+        "+r"(dst_v),         // %2
+        "+r"(width)          // %3
+      : "r"(rgbuvconstants)  // %4
       : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
         "q15");
 }
 
+// RGB to bt601 coefficients
+// UB   0.875 coefficient = 112
+// UG -0.5781 coefficient = 74
+// UR -0.2969 coefficient = 38
+// VB -0.1406 coefficient = 18
+// VG -0.7344 coefficient = 94
+// VR   0.875 coefficient = 112 (ignored)
+
+static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
+                                                            {18, 94, 112, 0}};
+
+// RGB to JPeg coefficients
+// UB coeff 0.500    = 127
+// UG coeff -0.33126 = 84
+// UR coeff -0.16874 = 43
+// VB coeff -0.08131 = 20
+// VG coeff -0.41869 = 107
+// VR coeff 0.500    = 127 (ignored)
+
+static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
+                                                            {20, 107, 127, 0}};
+
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+                            &kRgb24I601UVConstants);
+}
+
+void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+                            &kRgb24JPegUVConstants);
+}
+
 // clang-format off
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
 #define RGBTOUV(QB, QG, QR)                                                 \
@@ -2702,7 +2748,6 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
 struct RgbConstants {
   uint8_t kRGBToY[4];
   uint16_t kAddY;
-  uint16_t pad;
 };
 
 // RGB to JPeg coefficients
@@ -2710,11 +2755,9 @@ struct RgbConstants {
 // G * 0.5870 coefficient = 150
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
-static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
-                                                        128,
-                                                        0};
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
 
-static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
 
 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
@@ -2723,12 +2766,9 @@ static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
 // Add 16.5 = 0x1080
 
 static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                        0x1080,
-                                                        0};
+                                                        0x1080};
 
-static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
-                                                      0x1080,
-                                                      0};
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
 
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
@@ -3058,6 +3098,8 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int width) {
   asm volatile(
+      "vmov.u16    q15, #0x00ff                  \n"  // 255 for rounding up
+
       // Attenuate 8 pixels.
       "1:                                        \n"
       "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 pixels of ARGB.
@@ -3065,16 +3107,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
       "vmull.u8    q10, d0, d3                   \n"  // b * a
       "vmull.u8    q11, d1, d3                   \n"  // g * a
       "vmull.u8    q12, d2, d3                   \n"  // r * a
-      "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
-      "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
-      "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
+      "vaddhn.u16  d0, q10, q15                  \n"  // (b + 255) >> 8
+      "vaddhn.u16  d1, q11, q15                  \n"  // (g + 255) >> 8
+      "vaddhn.u16  d2, q12, q15                  \n"  // (r + 255) >> 8
       "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 pixels of ARGB.
       "bgt         1b                            \n"
       : "+r"(src_argb),  // %0
         "+r"(dst_argb),  // %1
         "+r"(width)      // %2
       :
-      : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
+      : "cc", "memory", "q0", "q1", "q10", "q11", "q12", "q15");
 }
 
 // Quantize 8 ARGB pixels (32 bytes).
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 74190d61..1679f87c 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2198,19 +2198,26 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
   );
 }
 
+struct RgbUVConstants {
+  uint8_t kRGBToU[4];
+  uint8_t kRGBToV[4];
+};
+
 // 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8_t* src_argb,
-                         uint8_t* dst_u,
-                         uint8_t* dst_v,
-                         int width) {
-  asm volatile(
-      "movi        v24.8b, #112                  \n"  // UB / VR 0.875
-                                                      // coefficient
-      "movi        v25.8b, #74                   \n"  // UG -0.5781 coefficient
-      "movi        v26.8b, #38                   \n"  // UR -0.2969 coefficient
-      "movi        v27.8b, #18                   \n"  // VB -0.1406 coefficient
-      "movi        v28.8b, #94                   \n"  // VG -0.7344 coefficient
-      "movi        v29.16b,#0x80                 \n"  // 128.5
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width,
+                               const struct RgbUVConstants* rgbuvconstants) {
+  asm volatile(
+      "ldr         d0, [%4]                      \n"  // load rgbuvconstants
+      "dup         v24.16b, v0.b[0]              \n"  // UB  0.875 coefficient
+      "dup         v25.16b, v0.b[1]              \n"  // UG -0.5781 coefficient
+      "dup         v26.16b, v0.b[2]              \n"  // UR -0.2969 coefficient
+      "dup         v27.16b, v0.b[4]              \n"  // VB -0.1406 coefficient
+      "dup         v28.16b, v0.b[5]              \n"  // VG -0.7344 coefficient
+      "movi        v29.16b, #0x80                \n"  // 128.5
+
       "1:                                        \n"
       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
@@ -2229,15 +2236,53 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
       "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels U.
       "st1         {v1.8b}, [%2], #8             \n"  // store 8 pixels V.
       "b.gt        1b                            \n"
-      : "+r"(src_argb),  // %0
-        "+r"(dst_u),     // %1
-        "+r"(dst_v),     // %2
-        "+r"(width)      // %3
-      :
+      : "+r"(src_argb),      // %0
+        "+r"(dst_u),         // %1
+        "+r"(dst_v),         // %2
+        "+r"(width)          // %3
+      : "r"(rgbuvconstants)  // %4
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
         "v27", "v28", "v29");
 }
 
+// RGB to bt601 coefficients
+// UB   0.875 coefficient = 112
+// UG -0.5781 coefficient = 74
+// UR -0.2969 coefficient = 38
+// VB -0.1406 coefficient = 18
+// VG -0.7344 coefficient = 94
+// VR   0.875 coefficient = 112 (ignored)
+
+static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
+                                                            {18, 94, 112, 0}};
+
+// RGB to JPeg coefficients
+// UB coeff 0.500    = 127
+// UG coeff -0.33126 = 84
+// UR coeff -0.16874 = 43
+// VB coeff -0.08131 = 20
+// VG coeff -0.41869 = 107
+// VR coeff 0.500    = 127 (ignored)
+
+static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
+                                                            {20, 107, 127, 0}};
+
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+                            &kRgb24I601UVConstants);
+}
+
+void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+                            &kRgb24JPegUVConstants);
+}
+
 #define RGBTOUV_SETUP_REG                                                  \
   "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
   "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
@@ -2943,34 +2988,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
 struct RgbConstants {
   uint8_t kRGBToY[4];
   uint16_t kAddY;
-  uint16_t pad;
 };
 
-// RGB to JPeg coefficients
-// B * 0.1140 coefficient = 29
-// G * 0.5870 coefficient = 150
-// R * 0.2990 coefficient = 77
-// Add 0.5 = 0x80
-static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
-                                                        128,
-                                                        0};
-
-static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
-
-// RGB to BT.601 coefficients
-// B * 0.1016 coefficient = 25
-// G * 0.5078 coefficient = 129
-// R * 0.2578 coefficient = 66
-// Add 16.5 = 0x1080
-
-static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                        0x1080,
-                                                        0};
-
-static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
-                                                      0x1080,
-                                                      0};
-
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
                            uint8_t* dst_y,
@@ -3005,6 +3024,26 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
         "v17");
 }
 
+// RGB to JPeg coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+                                                        0x1080};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
+
 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
   ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
 }
@@ -3402,6 +3441,8 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int width) {
   asm volatile(
+      "movi        v7.8h, #0x00ff                \n"  // 255 for rounding up
+
       // Attenuate 8 pixels.
       "1:                                        \n"
       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
@@ -3410,16 +3451,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
       "prfm        pldl1keep, [%0, 448]          \n"
       "umull       v5.8h, v1.8b, v3.8b           \n"         // g * a
       "umull       v6.8h, v2.8b, v3.8b           \n"         // r * a
-      "uqrshrn     v0.8b, v4.8h, #8              \n"         // b >>= 8
-      "uqrshrn     v1.8b, v5.8h, #8              \n"         // g >>= 8
-      "uqrshrn     v2.8b, v6.8h, #8              \n"         // r >>= 8
+      "addhn       v0.8b, v4.8h, v7.8h           \n"         // (b + 255) >> 8
+      "addhn       v1.8b, v5.8h, v7.8h           \n"         // (g + 255) >> 8
+      "addhn       v2.8b, v6.8h, v7.8h           \n"         // (r + 255) >> 8
       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
       "b.gt        1b                            \n"
       : "+r"(src_argb),  // %0
         "+r"(dst_argb),  // %1
         "+r"(width)      // %2
       :
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 // Quantize 8 ARGB pixels (32 bytes).
@@ -3960,6 +4001,86 @@ void ByteToFloatRow_NEON(const uint8_t* src,
       : "cc", "memory", "v1", "v2", "v3");
 }
 
+// Convert FP16 Half Floats to FP32 Floats
+void ConvertFP16ToFP32Row_NEON(const uint16_t* src,  // fp16
+                               float* dst,
+                               int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v1.8h}, [%0], #16            \n"  // load 8 halffloats
+      "subs        %w2, %w2, #8                  \n"  // 8 floats per loop
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "fcvtl       v2.4s, v1.4h                  \n"  // 8 floats
+      "fcvtl2      v3.4s, v1.8h                  \n"
+      "stp         q2, q3, [%1], #32             \n"  // store 8 floats
+      "b.gt        1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
+// Convert FP16 Half Floats to FP32 Floats
+// Read a column and write a row
+void ConvertFP16ToFP32Column_NEON(const uint16_t* src,  // fp16
+                                  int src_stride,       // stride in elements
+                                  float* dst,
+                                  int width) {
+  asm volatile(
+      "cmp         %w2, #8                       \n"  // Is there 8 rows?
+      "b.lo        2f                            \n"
+      "1:                                        \n"
+      "ld1         {v0.h}[0], [%0], %3           \n"  // load 8 halffloats
+      "ld1         {v0.h}[1], [%0], %3           \n"
+      "ld1         {v0.h}[2], [%0], %3           \n"
+      "ld1         {v0.h}[3], [%0], %3           \n"
+      "ld1         {v1.h}[0], [%0], %3           \n"
+      "ld1         {v1.h}[1], [%0], %3           \n"
+      "ld1         {v1.h}[2], [%0], %3           \n"
+      "ld1         {v1.h}[3], [%0], %3           \n"
+      "subs        %w2, %w2, #8                  \n"  // 8 rows per loop
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "fcvtl       v2.4s, v0.4h                  \n"  // 4 floats
+      "fcvtl       v3.4s, v1.4h                  \n"  // 4 more floats
+      "stp         q2, q3, [%1], #32             \n"  // store 8 floats
+      "b.gt        1b                            \n"
+      "cmp         %w2, #1                       \n"  // Is there 1 value?
+      "b.lo        3f                            \n"
+      "2:                                        \n"
+      "ld1         {v1.h}[0], [%0], %3           \n"  // load 1 halffloats
+      "subs        %w2, %w2, #1                  \n"  // 1 floats per loop
+      "fcvtl       v2.4s, v1.4h                  \n"  // 1 floats
+      "str         s2, [%1], #4                  \n"  // store 1 floats
+      "b.gt        2b                            \n"
+      "3:                                        \n"
+      : "+r"(src),                        // %0
+        "+r"(dst),                        // %1
+        "+r"(width)                       // %2
+      : "r"((ptrdiff_t)(src_stride * 2))  // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Convert FP32 Floats to FP16 Half Floats
+void ConvertFP32ToFP16Row_NEON(const float* src,
+                               uint16_t* dst,  // fp16
+                               int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ldp         q2, q3, [%0], #32             \n"  // load 8 floats
+      "subs        %w2, %w2, #8                  \n"  // 8 floats per loop
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "fcvtn       v1.4h, v2.4s                  \n"  // 8 fp16 halffloats
+      "fcvtn2      v1.8h, v3.4s                  \n"
+      "str         q1, [%1], #16                 \n"  // store 8 fp16 halffloats
+      "b.gt        1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
 float ScaleMaxSamples_NEON(const float* src,
                            float* dst,
                            float scale,
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
index 27e91a3b..c875be2f 100644
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@@ -17,7 +17,9 @@
 
 #include "libyuv/row.h"
 
-#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
+// This module is for clang rvv. GCC hasn't supported segment load & store.
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
+    defined(__clang__)
 #include <assert.h>
 #include <riscv_vector.h>
 
@@ -29,48 +31,48 @@ extern "C" {
 // Fill YUV -> RGB conversion constants into vectors
 // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
 // register) is set to round-to-nearest-up mode(0).
-#define YUVTORGB_SETUP(vl, yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
-  {                                                                  \
-    asm volatile("csrwi vxrm, 0");                                   \
-    ub = yuvconst->kUVCoeff[0];                                      \
-    vr = yuvconst->kUVCoeff[1];                                      \
-    ug = yuvconst->kUVCoeff[2];                                      \
-    vg = yuvconst->kUVCoeff[3];                                      \
-    yg = yuvconst->kRGBCoeffBias[0];                                 \
-    bb = yuvconst->kRGBCoeffBias[1] + 32;                            \
-    bg = yuvconst->kRGBCoeffBias[2] - 32;                            \
-    br = yuvconst->kRGBCoeffBias[3] + 32;                            \
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
+  {                                                              \
+    asm volatile("csrwi vxrm, 0");                               \
+    ub = yuvconst->kUVCoeff[0];                                  \
+    vr = yuvconst->kUVCoeff[1];                                  \
+    ug = yuvconst->kUVCoeff[2];                                  \
+    vg = yuvconst->kUVCoeff[3];                                  \
+    yg = yuvconst->kRGBCoeffBias[0];                             \
+    bb = yuvconst->kRGBCoeffBias[1] + 32;                        \
+    bg = yuvconst->kRGBCoeffBias[2] - 32;                        \
+    br = yuvconst->kRGBCoeffBias[3] + 32;                        \
   }
 
-// Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422
-#define READYUV422(vl, v_u, v_v, v_y_16)                \
-  {                                                     \
-    vuint8m1_t v_tmp0, v_tmp1;                          \
-    vuint8m2_t v_y;                                     \
-    vuint16m2_t v_u_16, v_v_16;                         \
-    vl = __riscv_vsetvl_e8m1((w + 1) / 2);              \
-    v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl);            \
-    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);    \
-    v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl);            \
-    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);    \
-    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \
-    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \
-    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);    \
-    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);    \
-    vl = __riscv_vsetvl_e8m2(w);                        \
-    v_y = __riscv_vle8_v_u8m2(src_y, vl);               \
-    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);       \
+// Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422
+#define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
+  {                                                              \
+    vuint8m1_t v_tmp0, v_tmp1;                                   \
+    vuint8m2_t v_y;                                              \
+    vuint16m2_t v_u_16, v_v_16;                                  \
+    vl = __riscv_vsetvl_e8m1((w + 1) / 2);                       \
+    v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl);                     \
+    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);             \
+    v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl);                     \
+    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);             \
+    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);          \
+    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);          \
+    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);             \
+    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);             \
+    vl = __riscv_vsetvl_e8m2(w);                                 \
+    v_y = __riscv_vle8_v_u8m2(src_y, vl);                        \
+    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);                \
   }
 
-// Read [VLEN/8] Y, [VLEN/8] U, and [VLEN/8] V from 444
-#define READYUV444(vl, v_u, v_v, v_y_16)          \
-  {                                               \
-    vuint8m2_t v_y;                               \
-    vl = __riscv_vsetvl_e8m2(w);                  \
-    v_y = __riscv_vle8_v_u8m2(src_y, vl);         \
-    v_u = __riscv_vle8_v_u8m2(src_u, vl);         \
-    v_v = __riscv_vle8_v_u8m2(src_v, vl);         \
-    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
+// Read [2*VLEN/8] Y, [2*VLEN/8] U, and [2*VLEN/8] V from 444
+#define READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
+  {                                                              \
+    vuint8m2_t v_y;                                              \
+    vl = __riscv_vsetvl_e8m2(w);                                 \
+    v_y = __riscv_vle8_v_u8m2(src_y, vl);                        \
+    v_u = __riscv_vle8_v_u8m2(src_u, vl);                        \
+    v_v = __riscv_vle8_v_u8m2(src_v, vl);                        \
+    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);                \
   }
 
 // Convert from YUV to fixed point RGB
@@ -101,6 +103,45 @@ extern "C" {
     v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl);            \
   }
 
+// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv
+#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16)   \
+  {                                                        \
+    vuint8m1_t v_tmp0, v_tmp1;                             \
+    vuint8m2_t v_y;                                        \
+    vuint16m2_t v_u_16, v_v_16;                            \
+    vl = __riscv_vsetvl_e8m1((w + 1) / 2);                 \
+    __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_uv, vl); \
+    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);       \
+    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);       \
+    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);    \
+    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);    \
+    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);       \
+    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);       \
+    vl = __riscv_vsetvl_e8m2(w);                           \
+    v_y = __riscv_vle8_v_u8m2(src_y, vl);                  \
+    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);          \
+  }
+
+// Read 2*[VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu
+#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16)   \
+  {                                                        \
+    vuint8m1_t v_tmp0, v_tmp1;                             \
+    vuint8m2_t v_y;                                        \
+    vuint16m2_t v_u_16, v_v_16;                            \
+    vl = __riscv_vsetvl_e8m1((w + 1) / 2);                 \
+    __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_vu, vl); \
+    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);       \
+    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);       \
+    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);    \
+    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);    \
+    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);       \
+    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);       \
+    vl = __riscv_vsetvl_e8m2(w);                           \
+    v_y = __riscv_vle8_v_u8m2(src_y, vl);                  \
+    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);          \
+  }
+
+#ifdef HAS_ARGBTOAR64ROW_RVV
 void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
   size_t avl = (size_t)4 * width;
   do {
@@ -116,7 +157,9 @@ void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
     dst_ar64 += vl;
   } while (avl > 0);
 }
+#endif
 
+#ifdef HAS_ARGBTOAB64ROW_RVV
 void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
   size_t avl = (size_t)width;
   do {
@@ -138,7 +181,9 @@ void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
     dst_ab64 += 4 * vl;
   } while (avl > 0);
 }
+#endif
 
+#ifdef HAS_AR64TOARGBROW_RVV
 void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
   size_t avl = (size_t)4 * width;
   do {
@@ -153,7 +198,9 @@ void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
     dst_argb += vl;
   } while (avl > 0);
 }
+#endif
 
+#ifdef HAS_AB64TOARGBROW_RVV
 void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
   size_t avl = (size_t)width;
   do {
@@ -171,7 +218,9 @@ void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
     dst_argb += 4 * vl;
   } while (avl > 0);
 }
+#endif
 
+#ifdef HAS_RAWTOARGBROW_RVV
 void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
   size_t w = (size_t)width;
   size_t vl = __riscv_vsetvl_e8m2(w);
@@ -186,7 +235,9 @@ void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
     vl = __riscv_vsetvl_e8m2(w);
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_RAWTORGBAROW_RVV
 void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
   size_t w = (size_t)width;
   size_t vl = __riscv_vsetvl_e8m2(w);
@@ -201,7 +252,9 @@ void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
     vl = __riscv_vsetvl_e8m2(w);
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_RAWTORGB24ROW_RVV
 void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
   size_t w = (size_t)width;
   do {
@@ -214,7 +267,9 @@ void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
     dst_rgb24 += vl * 3;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_ARGBTORAWROW_RVV
 void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
   size_t w = (size_t)width;
   do {
@@ -227,7 +282,9 @@ void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
     dst_raw += vl * 3;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_ARGBTORGB24ROW_RVV
 void ARGBToRGB24Row_RVV(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {
@@ -242,7 +299,9 @@ void ARGBToRGB24Row_RVV(const uint8_t* src_argb,
     dst_rgb24 += vl * 3;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_RGB24TOARGBROW_RVV
 void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
@@ -259,24 +318,26 @@ void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
     vl = __riscv_vsetvl_e8m2(w);
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_I444TOARGBROW_RVV
 void I444ToARGBRow_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
-  size_t vl;
   size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
   uint8_t ub, vr, ug, vg;
   int16_t yg, bb, bg, br;
   vuint8m2_t v_u, v_v;
   vuint8m2_t v_b, v_g, v_r, v_a;
   vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
-  YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
   v_a = __riscv_vmv_v_x_u8m2(255u, vl);
   do {
-    READYUV444(vl, v_u, v_v, v_y_16);
+    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
     YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
              v_b_16, v_r_16);
     RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
@@ -288,7 +349,9 @@ void I444ToARGBRow_RVV(const uint8_t* src_y,
     dst_argb += vl * 4;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_I444ALPHATOARGBROW_RVV
 void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
@@ -303,9 +366,9 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
   vuint8m2_t v_u, v_v;
   vuint8m2_t v_b, v_g, v_r, v_a;
   vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
-  YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
   do {
-    READYUV444(vl, v_u, v_v, v_y_16);
+    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
     v_a = __riscv_vle8_v_u8m2(src_a, vl);
     YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
              v_b_16, v_r_16);
@@ -319,7 +382,9 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
     dst_argb += vl * 4;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_I444TORGB24ROW_RVV
 void I444ToRGB24Row_RVV(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
@@ -333,9 +398,9 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y,
   vuint8m2_t v_u, v_v;
   vuint8m2_t v_b, v_g, v_r;
   vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
-  YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
   do {
-    READYUV444(vl, v_u, v_v, v_y_16);
+    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
     YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
              v_b_16, v_r_16);
     RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
@@ -347,24 +412,26 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y,
     dst_rgb24 += vl * 3;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_I422TOARGBROW_RVV
 void I422ToARGBRow_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
-  size_t vl;
   size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
   uint8_t ub, vr, ug, vg;
   int16_t yg, bb, bg, br;
   vuint8m2_t v_u, v_v;
   vuint8m2_t v_b, v_g, v_r, v_a;
   vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
-  YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
   v_a = __riscv_vmv_v_x_u8m2(255u, vl);
   do {
-    READYUV422(vl, v_u, v_v, v_y_16);
+    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
     YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
              v_b_16, v_r_16);
     RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
@@ -376,7 +443,9 @@ void I422ToARGBRow_RVV(const uint8_t* src_y,
     dst_argb += vl * 4;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_I422ALPHATOARGBROW_RVV
 void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
@@ -391,9 +460,9 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
   vuint8m2_t v_u, v_v;
   vuint8m2_t v_b, v_g, v_r, v_a;
   vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
-  YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
   do {
-    READYUV422(vl, v_u, v_v, v_y_16);
+    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
     v_a = __riscv_vle8_v_u8m2(src_a, vl);
     YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
              v_b_16, v_r_16);
@@ -407,24 +476,26 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
     dst_argb += vl * 4;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_I422TORGBAROW_RVV
 void I422ToRGBARow_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
-  size_t vl;
   size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
   uint8_t ub, vr, ug, vg;
   int16_t yg, bb, bg, br;
   vuint8m2_t v_u, v_v;
   vuint8m2_t v_b, v_g, v_r, v_a;
   vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
-  YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
   v_a = __riscv_vmv_v_x_u8m2(255u, vl);
   do {
-    READYUV422(vl, v_u, v_v, v_y_16);
+    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
     YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
              v_b_16, v_r_16);
     RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
@@ -436,7 +507,9 @@ void I422ToRGBARow_RVV(const uint8_t* src_y,
     dst_rgba += vl * 4;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_I422TORGB24ROW_RVV
 void I422ToRGB24Row_RVV(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
@@ -450,9 +523,9 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y,
   vuint8m2_t v_u, v_v;
   vuint8m2_t v_b, v_g, v_r;
   vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
-  YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
   do {
-    READYUV422(vl, v_u, v_v, v_y_16);
+    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
     YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
              v_b_16, v_r_16);
     RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
@@ -464,7 +537,9 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y,
     dst_rgb24 += vl * 3;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_I400TOARGBROW_RVV
 void I400ToARGBRow_RVV(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
@@ -503,7 +578,9 @@ void I400ToARGBRow_RVV(const uint8_t* src_y,
     dst_argb += vl * 4;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_J400TOARGBROW_RVV
 void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   size_t w = (size_t)width;
   size_t vl = __riscv_vsetvl_e8m2(w);
@@ -518,7 +595,9 @@ void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
     vl = __riscv_vsetvl_e8m2(w);
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_COPYROW_RVV
 void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
   size_t w = (size_t)width;
   do {
@@ -530,8 +609,125 @@ void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
     dst += vl;
   } while (w > 0);
 }
+#endif
+
+#ifdef HAS_NV12TOARGBROW_RVV
+void NV12ToARGBRow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r, v_a;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+    w -= vl;
+    src_y += vl;
+    src_uv += vl;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_NV12TORGB24ROW_RVV
+void NV12ToRGB24Row_RVV(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  do {
+    READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
+    w -= vl;
+    src_y += vl;
+    src_uv += vl;
+    dst_rgb24 += vl * 3;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_NV21TOARGBROW_RVV
+void NV21ToARGBRow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_vu,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r, v_a;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+  do {
+    READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+    w -= vl;
+    src_y += vl;
+    src_vu += vl;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_NV21TORGB24ROW_RVV
+void NV21ToRGB24Row_RVV(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvl_e8m2(w);
+  uint8_t ub, vr, ug, vg;
+  int16_t yg, bb, bg, br;
+  vuint8m2_t v_u, v_v;
+  vuint8m2_t v_b, v_g, v_r;
+  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+  do {
+    READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
+    w -= vl;
+    src_y += vl;
+    src_vu += vl;
+    dst_rgb24 += vl * 3;
+  } while (w > 0);
+}
+#endif
 
 // Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1
+
+#ifdef HAS_INTERPOLATEROW_RVV
 void InterpolateRow_RVV(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
@@ -554,13 +750,16 @@ void InterpolateRow_RVV(uint8_t* dst_ptr,
     } while (dst_w > 0);
     return;
   }
+  // To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up(0).
+  asm volatile("csrwi vxrm, 0");
   // Blend 50 / 50.
   if (y1_fraction == 128) {
     do {
       size_t vl = __riscv_vsetvl_e8m8(dst_w);
       vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl);
       vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl);
-      // Averaging add
+      // Use round-to-nearest-up mode for averaging add
       vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl);
       __riscv_vse8_v_u8m8(dst_ptr, row_out, vl);
       dst_w -= vl;
@@ -571,15 +770,13 @@ void InterpolateRow_RVV(uint8_t* dst_ptr,
     return;
   }
   // General purpose row blend.
-  // To match behavior on other platforms, vxrm (fixed-point rounding mode
-  // register) is set to round-to-nearest-up(0).
-  asm volatile("csrwi vxrm, 0");
   do {
     size_t vl = __riscv_vsetvl_e8m4(dst_w);
     vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl);
     vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl);
     vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl);
     acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl);
+    // Use round-to-nearest-up mode for vnclip
     __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl);
     dst_w -= vl;
     src_ptr += vl;
@@ -587,7 +784,9 @@ void InterpolateRow_RVV(uint8_t* dst_ptr,
     dst_ptr += vl;
   } while (dst_w > 0);
 }
+#endif
 
+#ifdef HAS_SPLITRGBROW_RVV
 void SplitRGBRow_RVV(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
@@ -608,7 +807,9 @@ void SplitRGBRow_RVV(const uint8_t* src_rgb,
     src_rgb += vl * 3;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_MERGERGBROW_RVV
 void MergeRGBRow_RVV(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
@@ -628,7 +829,9 @@ void MergeRGBRow_RVV(const uint8_t* src_r,
     dst_rgb += vl * 3;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_SPLITARGBROW_RVV
 void SplitARGBRow_RVV(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
@@ -652,7 +855,9 @@ void SplitARGBRow_RVV(const uint8_t* src_argb,
     src_argb += vl * 4;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_MERGEARGBROW_RVV
 void MergeARGBRow_RVV(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
@@ -675,7 +880,9 @@ void MergeARGBRow_RVV(const uint8_t* src_r,
     dst_argb += vl * 4;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_SPLITXRGBROW_RVV
 void SplitXRGBRow_RVV(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
@@ -696,7 +903,9 @@ void SplitXRGBRow_RVV(const uint8_t* src_argb,
     src_argb += vl * 4;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_MERGEXRGBROW_RVV
 void MergeXRGBRow_RVV(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
@@ -719,7 +928,9 @@ void MergeXRGBRow_RVV(const uint8_t* src_r,
     vl = __riscv_vsetvl_e8m2(w);
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_SPLITUVROW_RVV
 void SplitUVRow_RVV(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
@@ -737,7 +948,9 @@ void SplitUVRow_RVV(const uint8_t* src_uv,
     src_uv += 2 * vl;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_MERGEUVROW_RVV
 void MergeUVRow_RVV(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
@@ -755,6 +968,7 @@ void MergeUVRow_RVV(const uint8_t* src_u,
     dst_uv += 2 * vl;
   } while (w > 0);
 }
+#endif
 
 struct RgbConstants {
   uint8_t kRGBToY[4];
@@ -787,7 +1001,8 @@ static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
                                                       0x1080,
                                                       0};
 
-// ARGB expects first 3 values to contain RGB and 4th value is ignored.
+// ARGB expects first 3 values to contain RGB and 4th value is ignored
+#ifdef HAS_ARGBTOYMATRIXROW_RVV
 void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
                           uint8_t* dst_y,
                           int width,
@@ -817,24 +1032,34 @@ void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
     dst_y += vl;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_ARGBTOYROW_RVV
 void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) {
   ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants);
 }
+#endif
 
+#ifdef HAS_ARGBTOYJROW_RVV
 void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
   ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants);
 }
+#endif
 
+#ifdef HAS_ABGRTOYROW_RVV
 void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
   ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants);
 }
+#endif
 
+#ifdef HAS_ABGRTOYJROW_RVV
 void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
   ARGBToYMatrixRow_RVV(src_abgr, dst_yj, width, &kRawJPEGConstants);
 }
+#endif
 
 // RGBA expects first value to be A and ignored, then 3 values to contain RGB.
+#ifdef HAS_RGBATOYMATRIXROW_RVV
 void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba,
                           uint8_t* dst_y,
                           int width,
@@ -864,19 +1089,27 @@ void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba,
     dst_y += vl;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_RGBATOYROW_RVV
 void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
   RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants);
 }
+#endif
 
+#ifdef HAS_RGBATOYJROW_RVV
 void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
   RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
 }
+#endif
 
+#ifdef HAS_BGRATOYROW_RVV
 void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
   RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants);
 }
+#endif
 
+#ifdef HAS_RGBTOYMATRIXROW_RVV
 void RGBToYMatrixRow_RVV(const uint8_t* src_rgb,
                          uint8_t* dst_y,
                          int width,
@@ -906,51 +1139,179 @@ void RGBToYMatrixRow_RVV(const uint8_t* src_rgb,
     dst_y += vl;
   } while (w > 0);
 }
+#endif
 
+#ifdef HAS_RGB24TOYJROW_RVV
 void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
   RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
 }
+#endif
 
+#ifdef HAS_RAWTOYJROW_RVV
 void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
   RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants);
 }
+#endif
 
+#ifdef HAS_RGB24TOYROW_RVV
 void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
   RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants);
 }
+#endif
 
+#ifdef HAS_RAWTOYROW_RVV
 void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) {
   RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants);
 }
+#endif
+
+// Blend src_argb over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb or src_argb1.
+// src_argb: RGB values have already been pre-multiplied by the a.
+#ifdef HAS_ARGBBLENDROW_RVV
+void ARGBBlendRow_RVV(const uint8_t* src_argb,
+                      const uint8_t* src_argb1,
+                      uint8_t* dst_argb,
+                      int width) {
+  size_t w = (size_t)width;
+  size_t vl = __riscv_vsetvlmax_e8m2();
+  // clamp255((((256 - a) * b) >> 8) + f)
+  // = b * (256 - a) / 256 + f
+  // = b - (b * a / 256) + f
+  vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl);
+  do {
+    vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a;
+    vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a;
+    vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r;
+    vuint8m2_t v_dst_b, v_dst_g, v_dst_r;
+    vl = __riscv_vsetvl_e8m2(w);
+    __riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a,
+                            src_argb, vl);
+    __riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a,
+                            src_argb1, vl);
+
+    v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl);
+    v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl);
+    v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl);
+
+    v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl);
+    v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl);
+    v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl);
+
+    v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl);
+    v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl);
+    v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl);
+    __riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl);
+
+    w -= vl;
+    src_argb += 4 * vl;
+    src_argb1 += 4 * vl;
+    dst_argb += 4 * vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_BLENDPLANEROW_RVV
+void BlendPlaneRow_RVV(const uint8_t* src0,
+                       const uint8_t* src1,
+                       const uint8_t* alpha,
+                       uint8_t* dst,
+                       int width) {
+  size_t w = (size_t)width;
+  do {
+    vuint16m8_t v_dst_u16;
+    vuint8m4_t v_dst;
+    size_t vl = __riscv_vsetvl_e8m4(w);
+    vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
+    vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
+    vuint8m4_t v_alpha = __riscv_vle8_v_u8m4(alpha, vl);
+    vuint8m4_t v_255_minus_alpha = __riscv_vrsub_vx_u8m4(v_alpha, 255u, vl);
+
+    // (a * foreground) + (1-a) * background
+    v_dst_u16 = __riscv_vwmulu_vv_u16m8(v_alpha, v_src0, vl);
+    v_dst_u16 =
+        __riscv_vwmaccu_vv_u16m8(v_dst_u16, v_255_minus_alpha, v_src1, vl);
+    v_dst_u16 = __riscv_vadd_vx_u16m8(v_dst_u16, 255u, vl);
+    v_dst = __riscv_vnsrl_wx_u8m4(v_dst_u16, 8, vl);
+
+    __riscv_vse8_v_u8m4(dst, v_dst, vl);
+    w -= vl;
+    src0 += vl;
+    src1 += vl;
+    alpha += vl;
+    dst += vl;
+  } while (w > 0);
+}
+#endif
 
+// Attenuate: (f * a + 255) >> 8
+#ifdef HAS_ARGBATTENUATEROW_RVV
 void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
   size_t w = (size_t)width;
-  // To match behavior on other platforms, vxrm (fixed-point rounding mode
-  // register) is set to round-to-nearest-up(0).
-  asm volatile("csrwi vxrm, 0");
   do {
     vuint8m2_t v_b, v_g, v_r, v_a;
     vuint16m4_t v_ba_16, v_ga_16, v_ra_16;
     size_t vl = __riscv_vsetvl_e8m2(w);
     __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+    // f * a
     v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl);
     v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl);
     v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl);
-    v_b = __riscv_vnclipu_wx_u8m2(v_ba_16, 8, vl);
-    v_g = __riscv_vnclipu_wx_u8m2(v_ga_16, 8, vl);
-    v_r = __riscv_vnclipu_wx_u8m2(v_ra_16, 8, vl);
+    // f * a + 255
+    v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl);
+    v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl);
+    v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl);
+    // (f * a + 255) >> 8
+    v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl);
+    v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl);
+    v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl);
     __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
     w -= vl;
     src_argb += vl * 4;
     dst_argb += vl * 4;
   } while (w > 0);
 }
+#endif
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_RVV
+void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2_t v_b, v_g, v_r, v_a;
+    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+    __riscv_vse8_v_u8m2(dst_a, v_a, vl);
+    w -= vl;
+    src_argb += vl * 4;
+    dst_a += vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_RVV
+void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
+  size_t w = (size_t)width;
+  const ptrdiff_t dst_stride = 4;
+  dst += 3;
+  do {
+    size_t vl = __riscv_vsetvl_e8m8(w);
+    vuint8m8_t v_a = __riscv_vle8_v_u8m8(src, vl);
+    __riscv_vsse8_v_u8m8(dst, dst_stride, v_a, vl);
+    w -= vl;
+    src += vl;
+    dst += vl * dst_stride;
+  } while (w > 0);
+}
+#endif
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
+#endif  // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) &&
+        // defined(__clang__)
diff --git a/source/scale.cc b/source/scale.cc
index 80b030dc..43d973af 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -135,6 +135,14 @@ static void ScalePlaneDown2(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN2_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleRowDown2 = filtering == kFilterNone
+                        ? ScaleRowDown2_RVV
+                        : (filtering == kFilterLinear ? ScaleRowDown2Linear_RVV
+                                                      : ScaleRowDown2Box_RVV);
+  }
+#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -312,6 +320,11 @@ static void ScalePlaneDown4(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN4_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_RVV : ScaleRowDown4_RVV;
+  }
+#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -472,6 +485,17 @@ static void ScalePlaneDown34(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN34_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_RVV;
+      ScaleRowDown34_1 = ScaleRowDown34_RVV;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_RVV;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_RVV;
+    }
+  }
+#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -687,6 +711,17 @@ static void ScalePlaneDown38(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN38_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_RVV;
+      ScaleRowDown38_2 = ScaleRowDown38_RVV;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_RVV;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_RVV;
+    }
+  }
+#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -971,6 +1006,11 @@ static void ScalePlaneBox(int src_width,
       }
     }
 #endif
+#if defined(HAS_SCALEADDROW_RVV)
+    if (TestCpuFlag(kCpuHasRVV)) {
+      ScaleAddRow = ScaleAddRow_RVV;
+    }
+#endif
 
     for (j = 0; j < dst_height; ++j) {
       int boxheight;
@@ -1048,15 +1088,15 @@ static void ScalePlaneBox_16(int src_width,
 }
 
 // Scale plane down with bilinear interpolation.
-void ScalePlaneBilinearDown(int src_width,
-                            int src_height,
-                            int dst_width,
-                            int dst_height,
-                            int src_stride,
-                            int dst_stride,
-                            const uint8_t* src_ptr,
-                            uint8_t* dst_ptr,
-                            enum FilterMode filtering) {
+static void ScalePlaneBilinearDown(int src_width,
+                                   int src_height,
+                                   int dst_width,
+                                   int dst_height,
+                                   int src_stride,
+                                   int dst_stride,
+                                   const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
@@ -1176,15 +1216,15 @@ void ScalePlaneBilinearDown(int src_width,
   free_aligned_buffer_64(row);
 }
 
-void ScalePlaneBilinearDown_16(int src_width,
-                               int src_height,
-                               int dst_width,
-                               int dst_height,
-                               int src_stride,
-                               int dst_stride,
-                               const uint16_t* src_ptr,
-                               uint16_t* dst_ptr,
-                               enum FilterMode filtering) {
+static void ScalePlaneBilinearDown_16(int src_width,
+                                      int src_height,
+                                      int dst_width,
+                                      int dst_height,
+                                      int src_stride,
+                                      int dst_stride,
+                                      const uint16_t* src_ptr,
+                                      uint16_t* dst_ptr,
+                                      enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
@@ -1268,15 +1308,15 @@ void ScalePlaneBilinearDown_16(int src_width,
 }
 
 // Scale up down with bilinear interpolation.
-void ScalePlaneBilinearUp(int src_width,
-                          int src_height,
-                          int dst_width,
-                          int dst_height,
-                          int src_stride,
-                          int dst_stride,
-                          const uint8_t* src_ptr,
-                          uint8_t* dst_ptr,
-                          enum FilterMode filtering) {
+static void ScalePlaneBilinearUp(int src_width,
+                                 int src_height,
+                                 int dst_width,
+                                 int dst_height,
+                                 int src_stride,
+                                 int dst_stride,
+                                 const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 enum FilterMode filtering) {
   int j;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -1425,14 +1465,14 @@ void ScalePlaneBilinearUp(int src_width,
 // This is an optimized version for scaling up a plane to 2 times of
 // its original width, using linear interpolation.
 // This is used to scale U and V planes of I422 to I444.
-void ScalePlaneUp2_Linear(int src_width,
-                          int src_height,
-                          int dst_width,
-                          int dst_height,
-                          int src_stride,
-                          int dst_stride,
-                          const uint8_t* src_ptr,
-                          uint8_t* dst_ptr) {
+static void ScalePlaneUp2_Linear(int src_width,
+                                 int src_height,
+                                 int dst_width,
+                                 int dst_height,
+                                 int src_stride,
+                                 int dst_stride,
+                                 const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr) {
   void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
       ScaleRowUp2_Linear_Any_C;
   int i;
@@ -1465,6 +1505,11 @@ void ScalePlaneUp2_Linear(int src_width,
     ScaleRowUp = ScaleRowUp2_Linear_Any_NEON;
   }
 #endif
+#ifdef HAS_SCALEROWUP2_LINEAR_RVV
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleRowUp = ScaleRowUp2_Linear_RVV;
+  }
+#endif
 
   if (dst_height == 1) {
     ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
@@ -1484,14 +1529,14 @@ void ScalePlaneUp2_Linear(int src_width,
 // This is an optimized version for scaling up a plane to 2 times of
 // its original size, using bilinear interpolation.
 // This is used to scale U and V planes of I420 to I444.
-void ScalePlaneUp2_Bilinear(int src_width,
-                            int src_height,
-                            int dst_width,
-                            int dst_height,
-                            int src_stride,
-                            int dst_stride,
-                            const uint8_t* src_ptr,
-                            uint8_t* dst_ptr) {
+static void ScalePlaneUp2_Bilinear(int src_width,
+                                   int src_height,
+                                   int dst_width,
+                                   int dst_height,
+                                   int src_stride,
+                                   int dst_stride,
+                                   const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr) {
   void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                       uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
       ScaleRowUp2_Bilinear_Any_C;
@@ -1524,6 +1569,11 @@ void ScalePlaneUp2_Bilinear(int src_width,
     Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
   }
 #endif
+#ifdef HAS_SCALEROWUP2_BILINEAR_RVV
+  if (TestCpuFlag(kCpuHasRVV)) {
+    Scale2RowUp = ScaleRowUp2_Bilinear_RVV;
+  }
+#endif
 
   Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
   dst_ptr += dst_stride;
@@ -1544,14 +1594,14 @@ void ScalePlaneUp2_Bilinear(int src_width,
 // its original width, using linear interpolation.
 // stride is in count of uint16_t.
 // This is used to scale U and V planes of I210 to I410 and I212 to I412.
-void ScalePlaneUp2_12_Linear(int src_width,
-                             int src_height,
-                             int dst_width,
-                             int dst_height,
-                             int src_stride,
-                             int dst_stride,
-                             const uint16_t* src_ptr,
-                             uint16_t* dst_ptr) {
+static void ScalePlaneUp2_12_Linear(int src_width,
+                                    int src_height,
+                                    int dst_width,
+                                    int dst_height,
+                                    int src_stride,
+                                    int dst_stride,
+                                    const uint16_t* src_ptr,
+                                    uint16_t* dst_ptr) {
   void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
                      int dst_width) = ScaleRowUp2_Linear_16_Any_C;
   int i;
@@ -1598,14 +1648,14 @@ void ScalePlaneUp2_12_Linear(int src_width,
 // its original size, using bilinear interpolation.
 // stride is in count of uint16_t.
 // This is used to scale U and V planes of I010 to I410 and I012 to I412.
-void ScalePlaneUp2_12_Bilinear(int src_width,
-                               int src_height,
-                               int dst_width,
-                               int dst_height,
-                               int src_stride,
-                               int dst_stride,
-                               const uint16_t* src_ptr,
-                               uint16_t* dst_ptr) {
+static void ScalePlaneUp2_12_Bilinear(int src_width,
+                                      int src_height,
+                                      int dst_width,
+                                      int dst_height,
+                                      int src_stride,
+                                      int dst_stride,
+                                      const uint16_t* src_ptr,
+                                      uint16_t* dst_ptr) {
   void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                       uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
       ScaleRowUp2_Bilinear_16_Any_C;
@@ -1645,14 +1695,14 @@ void ScalePlaneUp2_12_Bilinear(int src_width,
   }
 }
 
-void ScalePlaneUp2_16_Linear(int src_width,
-                             int src_height,
-                             int dst_width,
-                             int dst_height,
-                             int src_stride,
-                             int dst_stride,
-                             const uint16_t* src_ptr,
-                             uint16_t* dst_ptr) {
+static void ScalePlaneUp2_16_Linear(int src_width,
+                                    int src_height,
+                                    int dst_width,
+                                    int dst_height,
+                                    int src_stride,
+                                    int dst_stride,
+                                    const uint16_t* src_ptr,
+                                    uint16_t* dst_ptr) {
   void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
                      int dst_width) = ScaleRowUp2_Linear_16_Any_C;
   int i;
@@ -1694,14 +1744,14 @@ void ScalePlaneUp2_16_Linear(int src_width,
   }
 }
 
-void ScalePlaneUp2_16_Bilinear(int src_width,
-                               int src_height,
-                               int dst_width,
-                               int dst_height,
-                               int src_stride,
-                               int dst_stride,
-                               const uint16_t* src_ptr,
-                               uint16_t* dst_ptr) {
+static void ScalePlaneUp2_16_Bilinear(int src_width,
+                                      int src_height,
+                                      int dst_width,
+                                      int dst_height,
+                                      int src_stride,
+                                      int dst_stride,
+                                      const uint16_t* src_ptr,
+                                      uint16_t* dst_ptr) {
   void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                       uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
       ScaleRowUp2_Bilinear_16_Any_C;
@@ -1741,15 +1791,15 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
   }
 }
 
-void ScalePlaneBilinearUp_16(int src_width,
-                             int src_height,
-                             int dst_width,
-                             int dst_height,
-                             int src_stride,
-                             int dst_stride,
-                             const uint16_t* src_ptr,
-                             uint16_t* dst_ptr,
-                             enum FilterMode filtering) {
+static void ScalePlaneBilinearUp_16(int src_width,
+                                    int src_height,
+                                    int dst_width,
+                                    int dst_height,
+                                    int src_stride,
+                                    int dst_stride,
+                                    const uint16_t* src_ptr,
+                                    uint16_t* dst_ptr,
+                                    enum FilterMode filtering) {
   int j;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index ddd8d29e..1d5c1b60 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -16,6 +16,7 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"  // For CopyARGB
 #include "libyuv/row.h"
+#include "libyuv/scale_argb.h"
 #include "libyuv/scale_row.h"
 
 #ifdef __cplusplus
@@ -127,6 +128,15 @@ static void ScaleARGBDown2(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBROWDOWN2_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_RVV
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_RVV
+                                          : ScaleARGBRowDown2Box_RVV);
+  }
+#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -184,6 +194,11 @@ static void ScaleARGBDown4Box(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBROWDOWN2_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_RVV;
+  }
+#endif
 
   for (j = 0; j < dst_height; ++j) {
     ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
@@ -263,6 +278,12 @@ static void ScaleARGBDownEven(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleARGBRowDownEven =
+        filtering ? ScaleARGBRowDownEvenBox_RVV : ScaleARGBRowDownEven_RVV;
+  }
+#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
diff --git a/source/scale_common.cc b/source/scale_common.cc
index 77455903..d07a39af 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -1964,35 +1964,6 @@ void ScaleSlope(int src_width,
 }
 #undef CENTERSTART
 
-// Read 8x2 upsample with filtering and write 16x1.
-// actually reads an extra pixel, so 9x2.
-void ScaleRowUp2_16_C(const uint16_t* src_ptr,
-                      ptrdiff_t src_stride,
-                      uint16_t* dst,
-                      int dst_width) {
-  const uint16_t* src2 = src_ptr + src_stride;
-
-  int x;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    uint16_t p0 = src_ptr[0];
-    uint16_t p1 = src_ptr[1];
-    uint16_t p2 = src2[0];
-    uint16_t p3 = src2[1];
-    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
-    dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
-    ++src_ptr;
-    ++src2;
-    dst += 2;
-  }
-  if (dst_width & 1) {
-    uint16_t p0 = src_ptr[0];
-    uint16_t p1 = src_ptr[1];
-    uint16_t p2 = src2[0];
-    uint16_t p3 = src2[1];
-    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
-  }
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index ad06ee83..7c072380 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -1118,101 +1118,6 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
 
 #undef LOAD2_DATA8_LANE
 
-// 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8_t* dst_ptr,
-                          const uint8_t* src_ptr,
-                          ptrdiff_t src_stride,
-                          int dst_width,
-                          int source_y_fraction) {
-  int y_fraction = 256 - source_y_fraction;
-  asm volatile(
-      "cmp         %w4, #0                       \n"
-      "b.eq        100f                          \n"
-      "add         %2, %2, %1                    \n"
-      "cmp         %w4, #64                      \n"
-      "b.eq        75f                           \n"
-      "cmp         %w4, #128                     \n"
-      "b.eq        50f                           \n"
-      "cmp         %w4, #192                     \n"
-      "b.eq        25f                           \n"
-
-      "dup         v5.8b, %w4                    \n"
-      "dup         v4.8b, %w5                    \n"
-      // General purpose row blend.
-      "1:                                        \n"
-      "ld1         {v0.16b}, [%1], #16           \n"
-      "ld1         {v1.16b}, [%2], #16           \n"
-      "subs        %w3, %w3, #16                 \n"
-      "umull       v6.8h, v0.8b, v4.8b           \n"
-      "umull2      v7.8h, v0.16b, v4.16b         \n"
-      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
-      "umlal       v6.8h, v1.8b, v5.8b           \n"
-      "umlal2      v7.8h, v1.16b, v5.16b         \n"
-      "prfm        pldl1keep, [%2, 448]          \n"
-      "rshrn       v0.8b, v6.8h, #8              \n"
-      "rshrn2      v0.16b, v7.8h, #8             \n"
-      "st1         {v0.16b}, [%0], #16           \n"
-      "b.gt        1b                            \n"
-      "b           99f                           \n"
-
-      // Blend 25 / 75.
-      "25:                                       \n"
-      "ld1         {v0.16b}, [%1], #16           \n"
-      "ld1         {v1.16b}, [%2], #16           \n"
-      "subs        %w3, %w3, #16                 \n"
-      "urhadd      v0.16b, v0.16b, v1.16b        \n"
-      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
-      "urhadd      v0.16b, v0.16b, v1.16b        \n"
-      "prfm        pldl1keep, [%2, 448]          \n"
-      "st1         {v0.16b}, [%0], #16           \n"
-      "b.gt        25b                           \n"
-      "b           99f                           \n"
-
-      // Blend 50 / 50.
-      "50:                                       \n"
-      "ld1         {v0.16b}, [%1], #16           \n"
-      "ld1         {v1.16b}, [%2], #16           \n"
-      "subs        %w3, %w3, #16                 \n"
-      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
-      "urhadd      v0.16b, v0.16b, v1.16b        \n"
-      "prfm        pldl1keep, [%2, 448]          \n"
-      "st1         {v0.16b}, [%0], #16           \n"
-      "b.gt        50b                           \n"
-      "b           99f                           \n"
-
-      // Blend 75 / 25.
-      "75:                                       \n"
-      "ld1         {v1.16b}, [%1], #16           \n"
-      "ld1         {v0.16b}, [%2], #16           \n"
-      "subs        %w3, %w3, #16                 \n"
-      "urhadd      v0.16b, v0.16b, v1.16b        \n"
-      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
-      "urhadd      v0.16b, v0.16b, v1.16b        \n"
-      "prfm        pldl1keep, [%2, 448]          \n"
-      "st1         {v0.16b}, [%0], #16           \n"
-      "b.gt        75b                           \n"
-      "b           99f                           \n"
-
-      // Blend 100 / 0 - Copy row unchanged.
-      "100:                                      \n"
-      "ld1         {v0.16b}, [%1], #16           \n"
-      "subs        %w3, %w3, #16                 \n"
-      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
-      "st1         {v0.16b}, [%0], #16           \n"
-      "b.gt        100b                          \n"
-
-      "99:                                       \n"
-      "st1         {v0.b}[15], [%0]              \n"
-      : "+r"(dst_ptr),            // %0
-        "+r"(src_ptr),            // %1
-        "+r"(src_stride),         // %2
-        "+r"(dst_width),          // %3
-        "+r"(source_y_fraction),  // %4
-        "+r"(y_fraction)          // %5
-      :
-      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
-}
-
 void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc
new file mode 100644
index 00000000..fd14842d
--- /dev/null
+++ b/source/scale_rvv.cc
@@ -0,0 +1,1038 @@
+/*
+ *  Copyright 2023 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * Contributed by Darren Hsieh <darren.hsieh@sifive.com>
+ * Contributed by Bruce Lai <bruce.lai@sifive.com>
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+// This module is for clang rvv. GCC hasn't supported segment load & store.
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
+    defined(__clang__)
+#include <assert.h>
+#include <riscv_vector.h>
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAS_SCALEADDROW_RVV
+void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+  size_t w = (size_t)src_width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m4(w);
+    vuint8m4_t v_src = __riscv_vle8_v_u8m4(src_ptr, vl);
+    vuint16m8_t v_dst = __riscv_vle16_v_u16m8(dst_ptr, vl);
+    // Use widening multiply-add instead of widening + add
+    v_dst = __riscv_vwmaccu_vx_u16m8(v_dst, 1, v_src, vl);
+    __riscv_vse16_v_u16m8(dst_ptr, v_dst, vl);
+    w -= vl;
+    src_ptr += vl;
+    dst_ptr += vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2_RVV
+void ScaleARGBRowDown2_RVV(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width) {
+  (void)src_stride;
+  size_t w = (size_t)dst_width;
+  const uint64_t* src = (const uint64_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
+  do {
+    size_t vl = __riscv_vsetvl_e64m8(w);
+    vuint64m8_t v_data = __riscv_vle64_v_u64m8(src, vl);
+    vuint32m4_t v_dst = __riscv_vnsrl_wx_u32m4(v_data, 32, vl);
+    __riscv_vse32_v_u32m4(dst, v_dst, vl);
+    w -= vl;
+    src += vl;
+    dst += vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2LINEAR_RVV
+void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width) {
+  (void)src_stride;
+  size_t w = (size_t)dst_width;
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up mode(0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    vuint8m4_t v_odd, v_even, v_dst;
+    vuint32m4_t v_odd_32, v_even_32;
+    size_t vl = __riscv_vsetvl_e32m4(w);
+    __riscv_vlseg2e32_v_u32m4(&v_even_32, &v_odd_32, src, vl);
+    v_even = __riscv_vreinterpret_v_u32m4_u8m4(v_even_32);
+    v_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_odd_32);
+    // Use round-to-nearest-up mode for averaging add
+    v_dst = __riscv_vaaddu_vv_u8m4(v_even, v_odd, vl * 4);
+    __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
+    w -= vl;
+    src += vl * 2;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2BOX_RVV
+void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width) {
+  size_t w = (size_t)dst_width;
+  const uint32_t* src0 = (const uint32_t*)(src_argb);
+  const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
+  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up mode(0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst;
+    vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16;
+    vuint32m4_t v_row0_odd_32, v_row0_even_32, v_row1_odd_32, v_row1_even_32;
+    size_t vl = __riscv_vsetvl_e32m4(w);
+    __riscv_vlseg2e32_v_u32m4(&v_row0_even_32, &v_row0_odd_32, src0, vl);
+    __riscv_vlseg2e32_v_u32m4(&v_row1_even_32, &v_row1_odd_32, src1, vl);
+    v_row0_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_even_32);
+    v_row0_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_odd_32);
+    v_row1_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_even_32);
+    v_row1_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_odd_32);
+    v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_even, v_row0_odd, vl * 4);
+    v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_even, v_row1_odd, vl * 4);
+    v_dst_16 = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4);
+    // Use round-to-nearest-up mode for vnclip
+    v_dst = __riscv_vnclipu_wx_u8m4(v_dst_16, 2, vl * 4);
+    __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
+    w -= vl;
+    src0 += vl * 2;
+    src1 += vl * 2;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#endif
+
+void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width) {
+  size_t w = (size_t)dst_width;
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
+  const int stride_byte = src_stepx * 4;
+  do {
+    size_t vl = __riscv_vsetvl_e32m8(w);
+    vuint32m8_t v_row = __riscv_vlse32_v_u32m8(src, stride_byte, vl);
+    __riscv_vse32_v_u32m8(dst, v_row, vl);
+    w -= vl;
+    src += vl * src_stepx;
+    dst += vl;
+  } while (w > 0);
+}
+
+#ifdef HAS_SCALEARGBROWDOWNEVENBOX_RVV
+void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width) {
+  size_t w = (size_t)dst_width;
+  const uint32_t* src0 = (const uint32_t*)(src_argb);
+  const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
+  const int stride_byte = src_stepx * 4;
+  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up mode(0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst;
+    vuint16m8_t v_row0_sum, v_row1_sum, v_sum;
+    vuint32m4_t v_row0_low_32, v_row0_high_32, v_row1_low_32, v_row1_high_32;
+    size_t vl = __riscv_vsetvl_e32m4(w);
+    __riscv_vlsseg2e32_v_u32m4(&v_row0_low_32, &v_row0_high_32, src0,
+                               stride_byte, vl);
+    __riscv_vlsseg2e32_v_u32m4(&v_row1_low_32, &v_row1_high_32, src1,
+                               stride_byte, vl);
+    v_row0_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_low_32);
+    v_row0_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_high_32);
+    v_row1_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_low_32);
+    v_row1_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_high_32);
+    v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_low, v_row0_high, vl * 4);
+    v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_low, v_row1_high, vl * 4);
+    v_sum = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4);
+    // Use round-to-nearest-up mode for vnclip
+    v_dst = __riscv_vnclipu_wx_u8m4(v_sum, 2, vl * 4);
+    __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
+    w -= vl;
+    src0 += vl * src_stepx;
+    src1 += vl * src_stepx;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN2_RVV
+void ScaleRowDown2_RVV(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  size_t w = (size_t)dst_width;
+  const uint16_t* src = (const uint16_t*)src_ptr;
+  (void)src_stride;
+  do {
+    size_t vl = __riscv_vsetvl_e16m8(w);
+    vuint16m8_t v_src = __riscv_vle16_v_u16m8(src, vl);
+    vuint8m4_t v_dst = __riscv_vnsrl_wx_u8m4(v_src, 8, vl);
+    __riscv_vse8_v_u8m4(dst, v_dst, vl);
+    w -= vl;
+    src += vl;
+    dst += vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN2LINEAR_RVV
+void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+  size_t w = (size_t)dst_width;
+  (void)src_stride;
+  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up mode(0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    vuint8m4_t v_s0, v_s1, v_dst;
+    size_t vl = __riscv_vsetvl_e8m4(w);
+    __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, src_ptr, vl);
+    // Use round-to-nearest-up mode for averaging add
+    v_dst = __riscv_vaaddu_vv_u8m4(v_s0, v_s1, vl);
+    __riscv_vse8_v_u8m4(dst, v_dst, vl);
+    w -= vl;
+    src_ptr += 2 * vl;
+    dst += vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN2BOX_RVV
+void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  size_t w = (size_t)dst_width;
+  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up mode(0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    size_t vl = __riscv_vsetvl_e8m4(w);
+    vuint8m4_t v_s0, v_s1, v_t0, v_t1;
+    vuint16m8_t v_s01, v_t01, v_st01;
+    vuint8m4_t v_dst;
+    __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, s, vl);
+    __riscv_vlseg2e8_v_u8m4(&v_t0, &v_t1, t, vl);
+    v_s01 = __riscv_vwaddu_vv_u16m8(v_s0, v_s1, vl);
+    v_t01 = __riscv_vwaddu_vv_u16m8(v_t0, v_t1, vl);
+    v_st01 = __riscv_vadd_vv_u16m8(v_s01, v_t01, vl);
+    // Use round-to-nearest-up mode for vnclip
+    v_dst = __riscv_vnclipu_wx_u8m4(v_st01, 2, vl);
+    __riscv_vse8_v_u8m4(dst, v_dst, vl);
+    w -= vl;
+    s += 2 * vl;
+    t += 2 * vl;
+    dst += vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN4_RVV
+void ScaleRowDown4_RVV(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst_ptr,
+                       int dst_width) {
+  size_t w = (size_t)dst_width;
+  (void)src_stride;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
+    __riscv_vse8_v_u8m2(dst_ptr, v_s2, vl);
+    w -= vl;
+    src_ptr += (4 * vl);
+    dst_ptr += vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN4BOX_RVV
+void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+  size_t w = (size_t)dst_width;
+  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up mode(0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+    vuint8m2_t v_t0, v_t1, v_t2, v_t3;
+    vuint8m2_t v_u0, v_u1, v_u2, v_u3;
+    vuint8m2_t v_v0, v_v1, v_v2, v_v3;
+    vuint16m4_t v_s01, v_s23, v_t01, v_t23;
+    vuint16m4_t v_u01, v_u23, v_v01, v_v23;
+    vuint16m4_t v_st01, v_st23, v_uv01, v_uv23;
+    vuint16m4_t v_st0123, v_uv0123, v_stuv0123;
+    vuint8m2_t v_dst;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+
+    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
+    v_s01 = __riscv_vwaddu_vv_u16m4(v_s0, v_s1, vl);
+
+    __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, src_ptr1, vl);
+    v_t01 = __riscv_vwaddu_vv_u16m4(v_t0, v_t1, vl);
+
+    __riscv_vlseg4e8_v_u8m2(&v_u0, &v_u1, &v_u2, &v_u3, src_ptr2, vl);
+    v_u01 = __riscv_vwaddu_vv_u16m4(v_u0, v_u1, vl);
+    v_u23 = __riscv_vwaddu_vv_u16m4(v_u2, v_u3, vl);
+
+    v_s23 = __riscv_vwaddu_vv_u16m4(v_s2, v_s3, vl);
+    v_t23 = __riscv_vwaddu_vv_u16m4(v_t2, v_t3, vl);
+    v_st01 = __riscv_vadd_vv_u16m4(v_s01, v_t01, vl);
+    v_st23 = __riscv_vadd_vv_u16m4(v_s23, v_t23, vl);
+
+    __riscv_vlseg4e8_v_u8m2(&v_v0, &v_v1, &v_v2, &v_v3, src_ptr3, vl);
+
+    v_v01 = __riscv_vwaddu_vv_u16m4(v_v0, v_v1, vl);
+    v_v23 = __riscv_vwaddu_vv_u16m4(v_v2, v_v3, vl);
+
+    v_uv01 = __riscv_vadd_vv_u16m4(v_u01, v_v01, vl);
+    v_uv23 = __riscv_vadd_vv_u16m4(v_u23, v_v23, vl);
+
+    v_st0123 = __riscv_vadd_vv_u16m4(v_st01, v_st23, vl);
+    v_uv0123 = __riscv_vadd_vv_u16m4(v_uv01, v_uv23, vl);
+    v_stuv0123 = __riscv_vadd_vv_u16m4(v_st0123, v_uv0123, vl);
+    // Use round-to-nearest-up mode for vnclip
+    v_dst = __riscv_vnclipu_wx_u8m2(v_stuv0123, 4, vl);
+    __riscv_vse8_v_u8m2(dst_ptr, v_dst, vl);
+    w -= vl;
+    src_ptr += 4 * vl;
+    src_ptr1 += 4 * vl;
+    src_ptr2 += 4 * vl;
+    src_ptr3 += 4 * vl;
+    dst_ptr += vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN34_RVV
+void ScaleRowDown34_RVV(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
+    __riscv_vsseg3e8_v_u8m2(dst_ptr, v_s0, v_s1, v_s3, vl);
+    w -= vl;
+    src_ptr += 4 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN34_0_BOX_RVV
+void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up mode(0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+    vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
+    vuint8m2_t v_u0, v_u1, v_u2, v_u3;
+    vuint16m4_t v_u1_u16;
+    vuint8m2_t v_a0, v_a1, v_a2;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl);
+
+    if (src_stride == 0) {
+      v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+      v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+      v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_s2, 2, vl);
+      v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_s3, 2, vl);
+    } else {
+      vuint8m2_t v_t0, v_t1, v_t2, v_t3;
+      __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl);
+      v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 0, vl);
+      v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 0, vl);
+      v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_t2, 0, vl);
+      v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_t3, 0, vl);
+      t += 4 * vl;
+    }
+
+    v_t0_u16 = __riscv_vwmaccu_vx_u16m4(v_t0_u16, 3, v_s0, vl);
+    v_t1_u16 = __riscv_vwmaccu_vx_u16m4(v_t1_u16, 3, v_s1, vl);
+    v_t2_u16 = __riscv_vwmaccu_vx_u16m4(v_t2_u16, 3, v_s2, vl);
+    v_t3_u16 = __riscv_vwmaccu_vx_u16m4(v_t3_u16, 3, v_s3, vl);
+
+    // Use round-to-nearest-up mode for vnclip & averaging add
+    v_u0 = __riscv_vnclipu_wx_u8m2(v_t0_u16, 2, vl);
+    v_u1 = __riscv_vnclipu_wx_u8m2(v_t1_u16, 2, vl);
+    v_u2 = __riscv_vnclipu_wx_u8m2(v_t2_u16, 2, vl);
+    v_u3 = __riscv_vnclipu_wx_u8m2(v_t3_u16, 2, vl);
+
+    // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u1, 0, vl);
+    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u0, vl);
+    v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+    // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+    v_a1 = __riscv_vaaddu_vv_u8m2(v_u1, v_u2, vl);
+
+    // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u2, 0, vl);
+    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u3, vl);
+    v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+    __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl);
+
+    w -= vl;
+    s += 4 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN34_1_BOX_RVV
+void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up mode(0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+    vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
+    vuint16m4_t v_u1_u16;
+    vuint8m2_t v_a0, v_a1, v_a2;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl);
+
+    // Use round-to-nearest-up mode for vnclip & averaging add
+    if (src_stride == 0) {
+      v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_s0, vl);
+      v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_s1, vl);
+      v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_s2, vl);
+      v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_s3, vl);
+    } else {
+      vuint8m2_t v_t0, v_t1, v_t2, v_t3;
+      __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl);
+      v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_t0, vl);
+      v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_t1, vl);
+      v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_t2, vl);
+      v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_t3, vl);
+      t += 4 * vl;
+    }
+    // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave1, 0, vl);
+    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave0, vl);
+    v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+    // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+    v_a1 = __riscv_vaaddu_vv_u8m2(v_ave1, v_ave2, vl);
+
+    // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave2, 0, vl);
+    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave3, vl);
+    v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+    __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl);
+
+    w -= vl;
+    s += 4 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN38_RVV
+void ScaleRowDown38_RVV(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  (void)src_stride;
+  assert(dst_width % 3 == 0);
+  do {
+    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+    size_t vl = __riscv_vsetvl_e8m1(w);
+    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+                            &v_s7, src_ptr, vl);
+    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_s0, v_s3, v_s6, vl);
+    w -= vl;
+    src_ptr += 8 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN38_2_BOX_RVV
+void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  const uint16_t coeff_a = (65536u / 6u);
+  const uint16_t coeff_b = (65536u / 4u);
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  do {
+    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+    vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
+    vuint16m2_t v_e0, v_e1, v_e2, v_e;
+    vuint16m2_t v_f0, v_f1, v_f2, v_f;
+    vuint16m2_t v_g0, v_g1, v_g;
+    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
+    size_t vl = __riscv_vsetvl_e8m1(w);
+    // s: e00, e10, e20, f00, f10, f20, g00, g10
+    // t: e01, e11, e21, f01, f11, f21, g01, g11
+    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+                            &v_s7, src_ptr, vl);
+    __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
+                            &v_t7, src_ptr + src_stride, vl);
+    // Calculate sum of [e00, e21] to v_e
+    // Calculate sum of [f00, f21] to v_f
+    // Calculate sum of [g00, g11] to v_g
+    v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
+    v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
+    v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
+    v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
+    v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
+    v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
+    v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
+    v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
+
+    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
+    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
+    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
+    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
+    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
+
+    // Average in 16-bit fixed-point
+    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
+    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
+    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
+
+    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
+    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
+    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
+
+    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
+    w -= vl;
+    src_ptr += 8 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN38_3_BOX_RVV
+void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  const uint16_t coeff_a = (65536u / 9u);
+  const uint16_t coeff_b = (65536u / 6u);
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  do {
+    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+    vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
+    vuint8m1_t v_u0, v_u1, v_u2, v_u3, v_u4, v_u5, v_u6, v_u7;
+    vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e;
+    vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f;
+    vuint16m2_t v_g0, v_g1, v_g2, v_g;
+    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
+    size_t vl = __riscv_vsetvl_e8m1(w);
+    // s: e00, e10, e20, f00, f10, f20, g00, g10
+    // t: e01, e11, e21, f01, f11, f21, g01, g11
+    // u: e02, e12, e22, f02, f12, f22, g02, g12
+    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+                            &v_s7, src_ptr, vl);
+    __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
+                            &v_t7, src_ptr + src_stride, vl);
+    __riscv_vlseg8e8_v_u8m1(&v_u0, &v_u1, &v_u2, &v_u3, &v_u4, &v_u5, &v_u6,
+                            &v_u7, src_ptr + 2 * src_stride, vl);
+    // Calculate sum of [e00, e22]
+    v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
+    v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
+    v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
+    v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl);
+    v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl);
+
+    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
+    v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl);
+    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl);
+    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
+    // Calculate sum of [f00, f22]
+    v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
+    v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
+    v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
+    v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl);
+    v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl);
+
+    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
+    v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl);
+    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl);
+    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
+    // Calculate sum of [g00, g12]
+    v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
+    v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
+    v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl);
+
+    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
+    v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl);
+
+    // Average in 16-bit fixed-point
+    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
+    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
+    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
+
+    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
+    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
+    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
+    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
+    w -= vl;
+    src_ptr += 8 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+#endif
+
+// ScaleUVRowUp2_(Bi)linear_RVV function is equal to other platforms'
+// ScaleRowUp2_(Bi)linear_Any_XXX. We process entire row in this function. Other
+// platforms only implement non-edge part of image and process edge with scalar.
+
+#ifdef HAS_SCALEROWUP2_LINEAR_RVV
+void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
+  size_t work_width = (size_t)dst_width - 1u;
+  size_t src_width = work_width >> 1u;
+  const uint8_t* work_src_ptr = src_ptr;
+  uint8_t* work_dst_ptr = dst_ptr + 1;
+  size_t vl = __riscv_vsetvlmax_e8m4();
+  vuint8m4_t v_3 = __riscv_vmv_v_x_u8m4(3, vl);
+  dst_ptr[0] = src_ptr[0];
+  while (src_width > 0) {
+    vuint8m4_t v_src0, v_src1, v_dst_odd, v_dst_even;
+    vuint16m8_t v_src0_u16, v_src1_u16;
+    size_t vl = __riscv_vsetvl_e8m4(src_width);
+    v_src0 = __riscv_vle8_v_u8m4(work_src_ptr, vl);
+    v_src1 = __riscv_vle8_v_u8m4(work_src_ptr + 1, vl);
+
+    v_src0_u16 = __riscv_vwaddu_vx_u16m8(v_src0, 2, vl);
+    v_src1_u16 = __riscv_vwaddu_vx_u16m8(v_src1, 2, vl);
+    v_src0_u16 = __riscv_vwmaccu_vv_u16m8(v_src0_u16, v_3, v_src1, vl);
+    v_src1_u16 = __riscv_vwmaccu_vv_u16m8(v_src1_u16, v_3, v_src0, vl);
+
+    v_dst_odd = __riscv_vnsrl_wx_u8m4(v_src0_u16, 2, vl);
+    v_dst_even = __riscv_vnsrl_wx_u8m4(v_src1_u16, 2, vl);
+
+    __riscv_vsseg2e8_v_u8m4(work_dst_ptr, v_dst_even, v_dst_odd, vl);
+
+    src_width -= vl;
+    work_src_ptr += vl;
+    work_dst_ptr += 2 * vl;
+  }
+  dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2];
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_RVV
+void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              ptrdiff_t dst_stride,
+                              int dst_width) {
+  size_t work_width = ((size_t)dst_width - 1u) & ~1u;
+  size_t src_width = work_width >> 1u;
+  const uint8_t* work_s = src_ptr;
+  const uint8_t* work_t = src_ptr + src_stride;
+  const uint8_t* s = work_s;
+  const uint8_t* t = work_t;
+  uint8_t* d = dst_ptr;
+  uint8_t* e = dst_ptr + dst_stride;
+  uint8_t* work_d = d + 1;
+  uint8_t* work_e = e + 1;
+  size_t vl = __riscv_vsetvlmax_e16m4();
+  vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl);
+  vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl);
+  d[0] = (3 * s[0] + t[0] + 2) >> 2;
+  e[0] = (s[0] + 3 * t[0] + 2) >> 2;
+  while (src_width > 0) {
+    vuint8m2_t v_s0, v_s1, v_t0, v_t1;
+    vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16;
+    vuint16m4_t v_t0_u16_, v_t1_u16_;
+    vuint8m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd;
+    size_t vl = __riscv_vsetvl_e8m2(src_width);
+    v_s0 = __riscv_vle8_v_u8m2(work_s, vl);
+    v_s1 = __riscv_vle8_v_u8m2(work_s + 1, vl);
+
+    v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+    v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+    v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl);
+    v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl);
+
+    v_t0 = __riscv_vle8_v_u8m2(work_t, vl);
+    v_t1 = __riscv_vle8_v_u8m2(work_t + 1, vl);
+
+    v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl);
+    v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl);
+    v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl);
+    v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl);
+
+    v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl);
+    v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl);
+
+    v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl);
+    v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl);
+    v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl);
+    v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl);
+
+    v_dst0_odd = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl);
+    v_dst0_even = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl);
+    v_dst1_odd = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl);
+    v_dst1_even = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl);
+
+    __riscv_vsseg2e8_v_u8m2(work_d, v_dst0_even, v_dst0_odd, vl);
+    __riscv_vsseg2e8_v_u8m2(work_e, v_dst1_even, v_dst1_odd, vl);
+
+    src_width -= vl;
+    work_s += vl;
+    work_t += vl;
+    work_d += 2 * vl;
+    work_e += 2 * vl;
+  }
+  d[dst_width - 1] =
+      (3 * s[(dst_width - 1) / 2] + t[(dst_width - 1) / 2] + 2) >> 2;
+  e[dst_width - 1] =
+      (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2;
+}
+#endif
+
+#ifdef HAS_SCALEUVROWDOWN2_RVV
+void ScaleUVRowDown2_RVV(const uint8_t* src_uv,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_uv,
+                         int dst_width) {
+  size_t w = (size_t)dst_width;
+  const uint32_t* src = (const uint32_t*)src_uv;
+  uint16_t* dst = (uint16_t*)dst_uv;
+  (void)src_stride;
+  do {
+    size_t vl = __riscv_vsetvl_e32m8(w);
+    vuint32m8_t v_data = __riscv_vle32_v_u32m8(src, vl);
+    vuint16m4_t v_u1v1 = __riscv_vnsrl_wx_u16m4(v_data, 16, vl);
+    __riscv_vse16_v_u16m4(dst, v_u1v1, vl);
+    w -= vl;
+    src += vl;
+    dst += vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEUVROWDOWN2LINEAR_RVV
+void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_uv,
+                               int dst_width) {
+  size_t w = (size_t)dst_width;
+  const uint16_t* src = (const uint16_t*)src_uv;
+  (void)src_stride;
+  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up mode(0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    vuint8m4_t v_u0v0, v_u1v1, v_avg;
+    vuint16m4_t v_u0v0_16, v_u1v1_16;
+    size_t vl = __riscv_vsetvl_e16m4(w);
+    __riscv_vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl);
+    v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16);
+    v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16);
+    // Use round-to-nearest-up mode for averaging add
+    v_avg = __riscv_vaaddu_vv_u8m4(v_u0v0, v_u1v1, vl * 2);
+    __riscv_vse8_v_u8m4(dst_uv, v_avg, vl * 2);
+    w -= vl;
+    src += vl * 2;
+    dst_uv += vl * 2;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_RVV
+void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_uv,
+                            int dst_width) {
+  const uint8_t* src_uv_row1 = src_uv + src_stride;
+  size_t w = (size_t)dst_width;
+  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up mode(0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0;
+    vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;
+    vuint16m4_t v_u0u1_row0, v_u0u1_row1, v_v0v1_row0, v_v0v1_row1;
+    vuint16m4_t v_sum0, v_sum1;
+    vuint8m2_t v_dst_u, v_dst_v;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+
+    __riscv_vlseg4e8_v_u8m2(&v_u0_row0, &v_v0_row0, &v_u1_row0, &v_v1_row0,
+                            src_uv, vl);
+    __riscv_vlseg4e8_v_u8m2(&v_u0_row1, &v_v0_row1, &v_u1_row1, &v_v1_row1,
+                            src_uv_row1, vl);
+
+    v_u0u1_row0 = __riscv_vwaddu_vv_u16m4(v_u0_row0, v_u1_row0, vl);
+    v_u0u1_row1 = __riscv_vwaddu_vv_u16m4(v_u0_row1, v_u1_row1, vl);
+    v_v0v1_row0 = __riscv_vwaddu_vv_u16m4(v_v0_row0, v_v1_row0, vl);
+    v_v0v1_row1 = __riscv_vwaddu_vv_u16m4(v_v0_row1, v_v1_row1, vl);
+
+    v_sum0 = __riscv_vadd_vv_u16m4(v_u0u1_row0, v_u0u1_row1, vl);
+    v_sum1 = __riscv_vadd_vv_u16m4(v_v0v1_row0, v_v0v1_row1, vl);
+    // Use round-to-nearest-up mode for vnclip
+    v_dst_u = __riscv_vnclipu_wx_u8m2(v_sum0, 2, vl);
+    v_dst_v = __riscv_vnclipu_wx_u8m2(v_sum1, 2, vl);
+
+    __riscv_vsseg2e8_v_u8m2(dst_uv, v_dst_u, v_dst_v, vl);
+
+    dst_uv += 2 * vl;
+    src_uv += 4 * vl;
+    w -= vl;
+    src_uv_row1 += 4 * vl;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEUVROWDOWN4_RVV
+void ScaleUVRowDown4_RVV(const uint8_t* src_uv,
+                         ptrdiff_t src_stride,
+                         int src_stepx,
+                         uint8_t* dst_uv,
+                         int dst_width) {
+  // Overflow will never happen here, since sizeof(size_t)/sizeof(int)=2.
+  // dst_width = src_width / 4 and src_width is also int.
+  size_t w = (size_t)dst_width * 8;
+  (void)src_stride;
+  (void)src_stepx;
+  do {
+    size_t vl = __riscv_vsetvl_e8m8(w);
+    vuint8m8_t v_row = __riscv_vle8_v_u8m8(src_uv, vl);
+    vuint64m8_t v_row_64 = __riscv_vreinterpret_v_u8m8_u64m8(v_row);
+    // Narrowing without clipping
+    vuint32m4_t v_tmp = __riscv_vncvt_x_x_w_u32m4(v_row_64, vl / 8);
+    vuint16m2_t v_dst_16 = __riscv_vncvt_x_x_w_u16m2(v_tmp, vl / 8);
+    vuint8m2_t v_dst = __riscv_vreinterpret_v_u16m2_u8m2(v_dst_16);
+    __riscv_vse8_v_u8m2(dst_uv, v_dst, vl / 4);
+    w -= vl;
+    src_uv += vl;
+    dst_uv += vl / 4;
+  } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEUVROWDOWNEVEN_RVV
+void ScaleUVRowDownEven_RVV(const uint8_t* src_uv,
+                            ptrdiff_t src_stride,
+                            int src_stepx,
+                            uint8_t* dst_uv,
+                            int dst_width) {
+  size_t w = (size_t)dst_width;
+  const ptrdiff_t stride_byte = (ptrdiff_t)src_stepx * 2;
+  const uint16_t* src = (const uint16_t*)(src_uv);
+  uint16_t* dst = (uint16_t*)(dst_uv);
+  (void)src_stride;
+  do {
+    size_t vl = __riscv_vsetvl_e16m8(w);
+    vuint16m8_t v_row = __riscv_vlse16_v_u16m8(src, stride_byte, vl);
+    __riscv_vse16_v_u16m8(dst, v_row, vl);
+    w -= vl;
+    src += vl * src_stepx;
+    dst += vl;
+  } while (w > 0);
+}
+#endif
+
+// ScaleUVRowUp2_(Bi)linear_RVV function is equal to other platforms'
+// ScaleUVRowUp2_(Bi)linear_Any_XXX. We process entire row in this function.
+// Other platforms only implement non-edge part of image and process edge with
+// scalar.
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV
+void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t work_width = ((size_t)dst_width - 1u) & ~1u;
+  uint16_t* work_dst_ptr = (uint16_t*)dst_ptr + 1;
+  const uint8_t* work_src_ptr = src_ptr;
+  size_t vl = __riscv_vsetvlmax_e8m4();
+  vuint8m4_t v_3_u8 = __riscv_vmv_v_x_u8m4(3, vl);
+  dst_ptr[0] = src_ptr[0];
+  dst_ptr[1] = src_ptr[1];
+  while (work_width > 0) {
+    vuint8m4_t v_uv0, v_uv1, v_dst_odd_u8, v_dst_even_u8;
+    vuint16m4_t v_dst_odd, v_dst_even;
+    vuint16m8_t v_uv0_u16, v_uv1_u16;
+    size_t vl = __riscv_vsetvl_e8m4(work_width);
+    v_uv0 = __riscv_vle8_v_u8m4(work_src_ptr, vl);
+    v_uv1 = __riscv_vle8_v_u8m4(work_src_ptr + 2, vl);
+
+    v_uv0_u16 = __riscv_vwaddu_vx_u16m8(v_uv0, 2, vl);
+    v_uv1_u16 = __riscv_vwaddu_vx_u16m8(v_uv1, 2, vl);
+
+    v_uv0_u16 = __riscv_vwmaccu_vv_u16m8(v_uv0_u16, v_3_u8, v_uv1, vl);
+    v_uv1_u16 = __riscv_vwmaccu_vv_u16m8(v_uv1_u16, v_3_u8, v_uv0, vl);
+
+    v_dst_odd_u8 = __riscv_vnsrl_wx_u8m4(v_uv0_u16, 2, vl);
+    v_dst_even_u8 = __riscv_vnsrl_wx_u8m4(v_uv1_u16, 2, vl);
+
+    v_dst_even = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_even_u8);
+    v_dst_odd = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_odd_u8);
+
+    __riscv_vsseg2e16_v_u16m4(work_dst_ptr, v_dst_even, v_dst_odd, vl / 2);
+
+    work_width -= vl;
+    work_src_ptr += vl;
+    work_dst_ptr += vl;
+  }
+  dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2];
+  dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1];
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV
+void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                ptrdiff_t dst_stride,
+                                int dst_width) {
+  size_t work_width = ((size_t)dst_width - 1u) & ~1u;
+  const uint8_t* work_s = src_ptr;
+  const uint8_t* work_t = src_ptr + src_stride;
+  const uint8_t* s = work_s;
+  const uint8_t* t = work_t;
+  uint8_t* d = dst_ptr;
+  uint8_t* e = dst_ptr + dst_stride;
+  uint16_t* work_d = (uint16_t*)d + 1;
+  uint16_t* work_e = (uint16_t*)e + 1;
+  size_t vl = __riscv_vsetvlmax_e16m4();
+  vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl);
+  vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl);
+  d[0] = (3 * s[0] + t[0] + 2) >> 2;
+  e[0] = (s[0] + 3 * t[0] + 2) >> 2;
+  d[1] = (3 * s[1] + t[1] + 2) >> 2;
+  e[1] = (s[1] + 3 * t[1] + 2) >> 2;
+  while (work_width > 0) {
+    vuint8m2_t v_s0, v_s1, v_t0, v_t1;
+    vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16;
+    vuint16m4_t v_t0_u16_, v_t1_u16_;
+    vuint8m2_t v_dst0_odd_u8, v_dst0_even_u8, v_dst1_odd_u8, v_dst1_even_u8;
+    vuint16m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd;
+    size_t vl = __riscv_vsetvl_e8m2(work_width);
+    v_s0 = __riscv_vle8_v_u8m2(work_s, vl);
+    v_s1 = __riscv_vle8_v_u8m2(work_s + 2, vl);
+
+    v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+    v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+    v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl);
+    v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl);
+
+    v_t0 = __riscv_vle8_v_u8m2(work_t, vl);
+    v_t1 = __riscv_vle8_v_u8m2(work_t + 2, vl);
+
+    v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl);
+    v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl);
+    v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl);
+    v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl);
+
+    v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl);
+    v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl);
+
+    v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl);
+    v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl);
+    v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl);
+    v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl);
+
+    v_dst0_odd_u8 = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl);
+    v_dst0_even_u8 = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl);
+    v_dst1_odd_u8 = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl);
+    v_dst1_even_u8 = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl);
+
+    v_dst0_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_even_u8);
+    v_dst0_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_odd_u8);
+    v_dst1_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_even_u8);
+    v_dst1_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_odd_u8);
+
+    __riscv_vsseg2e16_v_u16m2(work_d, v_dst0_even, v_dst0_odd, vl / 2);
+    __riscv_vsseg2e16_v_u16m2(work_e, v_dst1_even, v_dst1_odd, vl / 2);
+
+    work_width -= vl;
+    work_s += vl;
+    work_t += vl;
+    work_d += vl;
+    work_e += vl;
+  }
+  d[2 * dst_width - 2] =
+      (3 * s[((dst_width + 1) & ~1) - 2] + t[((dst_width + 1) & ~1) - 2] + 2) >>
+      2;
+  e[2 * dst_width - 2] =
+      (s[((dst_width + 1) & ~1) - 2] + 3 * t[((dst_width + 1) & ~1) - 2] + 2) >>
+      2;
+  d[2 * dst_width - 1] =
+      (3 * s[((dst_width + 1) & ~1) - 1] + t[((dst_width + 1) & ~1) - 1] + 2) >>
+      2;
+  e[2 * dst_width - 1] =
+      (s[((dst_width + 1) & ~1) - 1] + 3 * t[((dst_width + 1) & ~1) - 1] + 2) >>
+      2;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) &&
+        // defined(__clang__)
diff --git a/source/scale_uv.cc b/source/scale_uv.cc
index 1556071d..536b9436 100644
--- a/source/scale_uv.cc
+++ b/source/scale_uv.cc
@@ -128,6 +128,15 @@ static void ScaleUVDown2(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEUVROWDOWN2_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleUVRowDown2 =
+        filtering == kFilterNone
+            ? ScaleUVRowDown2_RVV
+            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_RVV
+                                          : ScaleUVRowDown2Box_RVV);
+  }
+#endif
 
 // This code is not enabled.  Only box filter is available at this time.
 #if defined(HAS_SCALEUVROWDOWN2_SSSE3)
@@ -231,6 +240,11 @@ static void ScaleUVDown4Box(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleUVRowDown2 = ScaleUVRowDown2Box_RVV;
+  }
+#endif
 
   for (j = 0; j < dst_height; ++j) {
     ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2);
@@ -310,6 +324,12 @@ static void ScaleUVDownEven(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_RVV)
+  if (TestCpuFlag(kCpuHasRVV) && !filtering) {
+    ScaleUVRowDownEven =
+        (col_step == 4) ? ScaleUVRowDown4_RVV : ScaleUVRowDownEven_RVV;
+  }
+#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -637,14 +657,14 @@ static void ScaleUVBilinearUp(int src_width,
 // This is an optimized version for scaling up a plane to 2 times of
 // its original width, using linear interpolation.
 // This is used to scale U and V planes of NV16 to NV24.
-void ScaleUVLinearUp2(int src_width,
-                      int src_height,
-                      int dst_width,
-                      int dst_height,
-                      int src_stride,
-                      int dst_stride,
-                      const uint8_t* src_uv,
-                      uint8_t* dst_uv) {
+static void ScaleUVLinearUp2(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_uv,
+                             uint8_t* dst_uv) {
   void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) =
       ScaleUVRowUp2_Linear_Any_C;
   int i;
@@ -672,6 +692,12 @@ void ScaleUVLinearUp2(int src_width,
   }
 #endif
 
+#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_RVV;
+  }
+#endif
+
   if (dst_height == 1) {
     ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv,
                dst_width);
@@ -690,14 +716,14 @@ void ScaleUVLinearUp2(int src_width,
 // This is an optimized version for scaling up a plane to 2 times of
 // its original size, using bilinear interpolation.
 // This is used to scale U and V planes of NV12 to NV24.
-void ScaleUVBilinearUp2(int src_width,
-                        int src_height,
-                        int dst_width,
-                        int dst_height,
-                        int src_stride,
-                        int dst_stride,
-                        const uint8_t* src_ptr,
-                        uint8_t* dst_ptr) {
+static void ScaleUVBilinearUp2(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint8_t* src_ptr,
+                               uint8_t* dst_ptr) {
   void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                       uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
       ScaleUVRowUp2_Bilinear_Any_C;
@@ -725,6 +751,12 @@ void ScaleUVBilinearUp2(int src_width,
   }
 #endif
 
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV
+  if (TestCpuFlag(kCpuHasRVV)) {
+    Scale2RowUp = ScaleUVRowUp2_Bilinear_RVV;
+  }
+#endif
+
   Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
   dst_ptr += dst_stride;
   for (x = 0; x < src_height - 1; ++x) {
@@ -744,14 +776,14 @@ void ScaleUVBilinearUp2(int src_width,
 // This is an optimized version for scaling up a plane to 2 times of
 // its original width, using linear interpolation.
 // This is used to scale U and V planes of P210 to P410.
-void ScaleUVLinearUp2_16(int src_width,
-                         int src_height,
-                         int dst_width,
-                         int dst_height,
-                         int src_stride,
-                         int dst_stride,
-                         const uint16_t* src_uv,
-                         uint16_t* dst_uv) {
+static void ScaleUVLinearUp2_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_uv,
+                                uint16_t* dst_uv) {
   void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) =
       ScaleUVRowUp2_Linear_16_Any_C;
   int i;
@@ -797,14 +829,14 @@ void ScaleUVLinearUp2_16(int src_width,
 // This is an optimized version for scaling up a plane to 2 times of
 // its original size, using bilinear interpolation.
 // This is used to scale U and V planes of P010 to P410.
-void ScaleUVBilinearUp2_16(int src_width,
-                           int src_height,
-                           int dst_width,
-                           int dst_height,
-                           int src_stride,
-                           int dst_stride,
-                           const uint16_t* src_ptr,
-                           uint16_t* dst_ptr) {
+static void ScaleUVBilinearUp2_16(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
+                                  int src_stride,
+                                  int dst_stride,
+                                  const uint16_t* src_ptr,
+                                  uint16_t* dst_ptr) {
   void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                       uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
       ScaleUVRowUp2_Bilinear_16_Any_C;