about summary refs log tree commit diff
path: root/source
diff options
context:
space:
mode:
authorFrank Barchard <fbarchard@google.com>2021-03-23 15:54:02 -0700
committerFrank Barchard <fbarchard@chromium.org>2021-03-23 23:45:54 +0000
commitd8f1bfc9816a9fc76f3a25cc0ee272fb9c07622a (patch)
tree6201f9cab35550653480bc372580d2c5014d074d /source
parentb046131c0bd44ca3a11276194d07b85373cfd608 (diff)
downloadlibyuv-d8f1bfc9816a9fc76f3a25cc0ee272fb9c07622a.tar.gz
Add RAWToJ420
Add J420 output from RAW. Optimize RGB24 and RAW To J420 on ARM by using
NEON for the 2 step conversion. Also fix sign-compare warning that was
breaking Windows build.

Bug: libyuv:887, b/183534734
Change-Id: I8c39334552dc0b28414e638708db413d6adf8d6e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2783382
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Diffstat (limited to 'source')
-rw-r--r--source/convert.cc262
-rw-r--r--source/convert_argb.cc12
-rw-r--r--source/row_gcc.cc30
-rw-r--r--source/row_neon.cc6
-rw-r--r--source/row_neon64.cc12
5 files changed, 273 insertions, 49 deletions
diff --git a/source/convert.cc b/source/convert.cc
index 1bd59659..768e0f37 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -1368,6 +1368,18 @@ int ARGBToI420(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
+#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -1388,22 +1400,6 @@ int ARGBToI420(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
- }
-#endif
#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYRow = ARGBToYRow_Any_MMI;
@@ -1771,7 +1767,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
}
// Neon version does direct RGB24 to YUV.
-#if defined(HAS_RGB24TOYROW_NEON)
+#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
RGB24ToYRow = RGB24ToYRow_Any_NEON;
@@ -1808,6 +1804,14 @@ int RGB24ToI420(const uint8_t* src_rgb24,
#endif
// Other platforms do intermediate conversion from RGB24 to ARGB.
#else
+#if defined(HAS_RGB24TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+ }
+ }
+#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@@ -1816,6 +1820,18 @@ int RGB24ToI420(const uint8_t* src_rgb24,
}
}
#endif
+#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -1960,6 +1976,14 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
}
#endif
#else
+#if defined(HAS_RGB24TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+ }
+ }
+#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@@ -1968,6 +1992,18 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
@@ -2111,6 +2147,26 @@ int RAWToI420(const uint8_t* src_raw,
#endif
// Other platforms do intermediate conversion from RAW to ARGB.
#else
+#if defined(HAS_RAWTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToARGBRow = RAWToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToARGBRow = RAWToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+ }
+#endif
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@@ -2186,6 +2242,178 @@ int RAWToI420(const uint8_t* src_raw,
return 0;
}
+// TODO(fbarchard): Use Matrix version to implement I420 and J420.
+// Convert RAW to J420.
+LIBYUV_API
+int RAWToJ420(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \
+ defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)
+ void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RAWToUVJRow_C;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
+ RAWToYJRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVJRow = RAWToUVJRow_Any_NEON;
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToUVJRow = RAWToUVJRow_NEON;
+ }
+ }
+ }
+// MMI and MSA version does direct RAW to YUV.
+#elif (defined(HAS_RAWTOYJROW_MMI) || defined(HAS_RAWTOYJROW_MSA))
+#if defined(HAS_RAWTOYJROW_MMI) && defined(HAS_RAWTOUVJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToUVJRow = RAWToUVJRow_Any_MMI;
+ RAWToYJRow = RAWToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYJRow = RAWToYJRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToUVJRow = RAWToUVJRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToUVJRow = RAWToUVJRow_Any_MSA;
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ RAWToUVJRow = RAWToUVJRow_MSA;
+ }
+ }
+#endif
+#else
+#if defined(HAS_RAWTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToARGBRow = RAWToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToARGBRow = RAWToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#endif
+
+ {
+#if !((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \
+ defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI))
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \
+ defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI))
+ RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYJRow(src_raw, dst_y, width);
+ RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+ ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \
+ defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI))
+ RAWToUVJRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYJRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVJRow(row, 0, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+#if !((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \
+ defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
// Convert RGB565 to I420.
LIBYUV_API
int RGB565ToI420(const uint8_t* src_rgb565,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 0bd330ec..16b5ff92 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -3376,14 +3376,14 @@ int AR30ToAB30(const uint8_t* src_ar30,
// Convert AR64 to ARGB.
LIBYUV_API
int AR64ToARGB(const uint16_t* src_ar64,
- int src_stride_ar64,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+ int src_stride_ar64,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb,
- int width) = AR64ToARGBRow_C;
+ int width) = AR64ToARGBRow_C;
if (!src_ar64 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 49d45397..1b4ad9b0 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1116,8 +1116,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
asm volatile(
"movdqa %3,%%xmm2 \n"
- "movdqa %4,%%xmm3 \n"
- LABELALIGN
+ "movdqa %4,%%xmm3 \n" LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -1129,11 +1128,11 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
"lea 0x20(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_ab64), // %1
- "+r"(width) // %2
- : "m"(kShuffleARGBToAB64Lo), // %3
- "m"(kShuffleARGBToAB64Hi) // %4
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleARGBToAB64Lo), // %3
+ "m"(kShuffleARGBToAB64Hi) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
@@ -1166,8 +1165,7 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ar64,
int width) {
asm volatile(
- "movdqa %3,%%xmm2 \n"
- LABELALIGN
+ "movdqa %3,%%xmm2 \n" LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -1220,8 +1218,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n"
- "vbroadcastf128 %4,%%ymm3 \n"
- LABELALIGN
+ "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
@@ -1233,11 +1230,11 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
"lea 0x40(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_ab64), // %1
- "+r"(width) // %2
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
: "m"(kShuffleARGBToAB64Lo), // %3
- "m"(kShuffleARGBToAB64Hi) // %3
+        "m"(kShuffleARGBToAB64Hi)   // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif
@@ -1275,8 +1272,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ar64,
int width) {
asm volatile(
- "vbroadcastf128 %3,%%ymm2 \n"
- LABELALIGN
+ "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 5414d1ef..5d109a3b 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2191,7 +2191,7 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
: "cc", "memory", "q0", "q1", "q2", "q3");
}
-static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15};
+static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15};
void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
uint8_t* dst_argb,
@@ -2362,9 +2362,9 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
"1: \n"
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q4, d0, d4 \n" // B
+ "vmull.u8 q4, d0, d4 \n" // R
"vmlal.u8 q4, d1, d5 \n" // G
- "vmlal.u8 q4, d2, d6 \n" // R
+ "vmlal.u8 q4, d2, d6 \n" // B
"vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 9662cd3c..3281e90f 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1628,10 +1628,10 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"stp q0, q2, [%1], #32 \n" // store 8 pixels
"b.gt 1b \n"
- : "+r"(src_ar64), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleAR64ToARGB) // %3
+ : "+r"(src_ar64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAR64ToARGB) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
@@ -2506,9 +2506,9 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // B
+ "umull v0.8h, v0.8b, v4.8b \n" // R
"umlal v0.8h, v1.8b, v5.8b \n" // G
- "umlal v0.8h, v2.8b, v6.8b \n" // R
+ "umlal v0.8h, v2.8b, v6.8b \n" // B
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n"