aboutsummaryrefslogtreecommitdiff
path: root/source/row_neon64.cc
diff options
context:
space:
mode:
authorFrank Barchard <fbarchard@google.com>2021-03-24 13:45:04 -0700
committerFrank Barchard <fbarchard@chromium.org>2021-03-24 21:37:10 +0000
commit312c02a5aad4adda67cb2e0cc93a497d12845522 (patch)
treece776a4db30d2319fad3bbf41fe48d6cdf1e2602 /source/row_neon64.cc
parentd8f1bfc9816a9fc76f3a25cc0ee272fb9c07622a (diff)
downloadlibyuv-312c02a5aad4adda67cb2e0cc93a497d12845522.tar.gz
Fixes for SplitUVPlane_16 and MergeUVPlane_16
Planar functions pass depth instead of scale factor. Row functions pass shift instead of depth. Add assert to C. AVX shift instruction expects a single shift value in XMM. Neon pass shift as input (not output). Split Neon reimplemented as left shift on shorts by negative to achieve right shift. Add planar unitests Bug: libyuv:888 Change-Id: I8fe62d3d777effc5321c361cd595c58b7f93807e Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2782086 Reviewed-by: richard winterton <rrwinterton@gmail.com> Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r--source/row_neon64.cc278
1 files changed, 133 insertions, 145 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 3281e90f..903bf5cd 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -673,8 +673,8 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
asm volatile(
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
- "prfm pldl1keep, [%0, 448] \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
"st1 {v0.16b}, [%1], #16 \n" // store U
"st1 {v1.16b}, [%2], #16 \n" // store V
"b.gt 1b \n"
@@ -696,9 +696,9 @@ void MergeUVRow_NEON(const uint8_t* src_u,
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load U
"ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"b.gt 1b \n"
: "+r"(src_u), // %0
@@ -719,8 +719,8 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
asm volatile(
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
- "prfm pldl1keep, [%0, 448] \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
"st1 {v0.16b}, [%1], #16 \n" // store R
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%3], #16 \n" // store B
@@ -746,12 +746,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
"ld1 {v0.16b}, [%0], #16 \n" // load R
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v2.16b}, [%2], #16 \n" // load B
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
- "prfm pldl1keep, [%0, 448] \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
@@ -773,8 +772,8 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba,
asm volatile(
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
- "prfm pldl1keep, [%0, 448] \n"
"subs %w5, %w5, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
"st1 {v0.16b}, [%3], #16 \n" // store B
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%1], #16 \n" // store R
@@ -804,11 +803,11 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v0.16b}, [%2], #16 \n" // load B
"ld1 {v3.16b}, [%3], #16 \n" // load A
+ "subs %w5, %w5, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%3, 448] \n"
- "subs %w5, %w5, #16 \n" // 16 processed per loop
"st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
"b.gt 1b \n"
: "+r"(src_r), // %0
@@ -831,8 +830,8 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba,
asm volatile(
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
- "prfm pldl1keep, [%0, 448] \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
"st1 {v0.16b}, [%3], #16 \n" // store B
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%1], #16 \n" // store R
@@ -859,10 +858,10 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
"ld1 {v2.16b}, [%0], #16 \n" // load R
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v0.16b}, [%2], #16 \n" // load B
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop
"st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB
"b.gt 1b \n"
: "+r"(src_r), // %0
@@ -1072,9 +1071,9 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
"movi v5.8b, #255 \n" // Alpha
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
+ "prfm pldl1keep, [%0, 448] \n"
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
"b.gt 1b \n"
@@ -1091,9 +1090,9 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
"movi v0.8b, #255 \n" // Alpha
"1: \n"
"ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v2.8b, v4.8b, v4.8b \n" // move g
+ "prfm pldl1keep, [%0, 448] \n"
"orr v1.8b, v5.8b, v5.8b \n" // move r
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
"b.gt 1b \n"
@@ -1109,9 +1108,9 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
+ "prfm pldl1keep, [%0, 448] \n"
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
"b.gt 1b \n"
@@ -1143,8 +1142,8 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
"movi v3.8b, #255 \n" // Alpha
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
RGB565TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
@@ -1233,8 +1232,8 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
asm volatile(
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
ARGB4444TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
@@ -1252,8 +1251,8 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
asm volatile(
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
// RGB24
"b.gt 1b \n"
@@ -1269,9 +1268,9 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile(
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "prfm pldl1keep, [%0, 448] \n"
"orr v5.8b, v1.8b, v1.8b \n" // mov b
"st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
"b.gt 1b \n"
@@ -1287,8 +1286,8 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
@@ -1303,8 +1302,8 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
@@ -1322,8 +1321,8 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
asm volatile(
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
- "prfm pldl1keep, [%0, 448] \n"
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "prfm pldl1keep, [%0, 448] \n"
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
"b.gt 1b \n"
@@ -1343,8 +1342,8 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
asm volatile(
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
- "prfm pldl1keep, [%0, 448] \n"
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "prfm pldl1keep, [%0, 448] \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
"b.gt 1b \n"
@@ -1366,10 +1365,10 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
asm volatile(
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "prfm pldl1keep, [%0, 448] \n"
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "prfm pldl1keep, [%0, 448] \n"
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
"st1 {v1.8b}, [%2], #8 \n" // store 8 U.
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
@@ -1394,10 +1393,10 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
asm volatile(
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "prfm pldl1keep, [%0, 448] \n"
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "prfm pldl1keep, [%0, 448] \n"
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
"st1 {v0.8b}, [%2], #8 \n" // store 8 U.
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
@@ -1422,8 +1421,8 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
"ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
"st1 {v1.16b}, [%1], #16 \n" // store 4.
"b.gt 1b \n"
@@ -1443,11 +1442,11 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
asm volatile(
"1: \n"
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
- "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 pixels
"orr v2.8b, v1.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n"
: "+r"(src_y), // %0
@@ -1467,8 +1466,8 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
asm volatile(
"1: \n"
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
- "prfm pldl1keep, [%0, 448] \n"
"orr v3.8b, v2.8b, v2.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
@@ -1490,8 +1489,8 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
// pixels
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
ARGBTORGB565
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n"
@@ -1511,9 +1510,9 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8
// pixels
- "prfm pldl1keep, [%0, 448] \n"
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v20.8b, v20.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
"uqadd v21.8b, v21.8b, v1.8b \n"
"uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
"st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
@@ -1532,8 +1531,8 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
// pixels
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
ARGBTOARGB1555
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
"b.gt 1b \n"
@@ -1553,8 +1552,8 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
"1: \n"
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
// pixels
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
ARGBTOARGB4444
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
"b.gt 1b \n"
@@ -1575,8 +1574,8 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb,
"1: \n"
"ldp q0, q2, [%0], #32 \n" // load 8 pixels
"mov v1.16b, v0.16b \n"
- "mov v3.16b, v2.16b \n"
"prfm pldl1keep, [%0, 448] \n"
+ "mov v3.16b, v2.16b \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
"st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
@@ -1597,9 +1596,9 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
"ldp q0, q2, [%0], #32 \n" // load 8 pixels
"tbl v0.16b, {v0.16b}, v4.16b \n"
"tbl v2.16b, {v2.16b}, v4.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
"mov v1.16b, v0.16b \n"
"mov v3.16b, v2.16b \n"
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
"st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
@@ -1622,8 +1621,8 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 4 pixels
"ldp q2, q3, [%0], #32 \n" // load 4 pixels
- "prfm pldl1keep, [%0, 448] \n"
"tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
"tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"stp q0, q2, [%1], #32 \n" // store 8 pixels
@@ -1646,8 +1645,8 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 4 pixels
"ldp q2, q3, [%0], #32 \n" // load 4 pixels
- "prfm pldl1keep, [%0, 448] \n"
"tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
"tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"stp q0, q2, [%1], #32 \n" // store 8 pixels
@@ -1667,9 +1666,9 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
@@ -1708,9 +1707,9 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"movi v6.8b, #77 \n" // R * 0.2990 coefficient
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
@@ -1730,9 +1729,9 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"movi v6.8b, #77 \n" // R * 0.2990 coefficient
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v0.8h, v1.8b, v4.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v0.8h, v2.8b, v5.8b \n" // G
"umlal v0.8h, v3.8b, v6.8b \n" // R
"uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
@@ -1760,9 +1759,9 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
"movi v29.16b,#0x80 \n" // 128.5
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlsl v4.8h, v1.8b, v25.8b \n" // G
"umlsl v4.8h, v2.8b, v26.8b \n" // R
"add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
@@ -1823,14 +1822,14 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "prfm pldl1keep, [%1, 448] \n"
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
@@ -1869,13 +1868,13 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "prfm pldl1keep, [%1, 448] \n"
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
@@ -1909,13 +1908,13 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
- "prfm pldl1keep, [%1, 448] \n"
"uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
"uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
@@ -1949,13 +1948,13 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "prfm pldl1keep, [%1, 448] \n"
"uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
"uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
@@ -1989,13 +1988,13 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "prfm pldl1keep, [%1, 448] \n"
"uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
@@ -2029,13 +2028,13 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
RGBTOUV_SETUP_REG
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
- "prfm pldl1keep, [%1, 448] \n"
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
@@ -2069,13 +2068,13 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
RGBTOUV_SETUP_REG
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
- "prfm pldl1keep, [%0, 448] \n"
"uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
"ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
- "prfm pldl1keep, [%1, 448] \n"
"uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
@@ -2110,9 +2109,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
RGBTOUV_SETUP_REG
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "prfm pldl1keep, [%0, 448] \n"
RGB565TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
"uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
@@ -2122,9 +2121,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
"uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
- "prfm pldl1keep, [%1, 448] \n"
RGB565TOARGB
"uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
"uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
"uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
@@ -2168,9 +2167,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
RGBTOUV_SETUP_REG
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "prfm pldl1keep, [%0, 448] \n"
RGB555TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
"uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
@@ -2180,9 +2179,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
"uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
- "prfm pldl1keep, [%1, 448] \n"
RGB555TOARGB
"uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
"uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
"uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
@@ -2226,9 +2225,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
RGBTOUV_SETUP_REG // sets v20-v25
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "prfm pldl1keep, [%0, 448] \n"
ARGB4444TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
"uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
@@ -2238,9 +2237,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
"uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
- "prfm pldl1keep, [%1, 448] \n"
ARGB4444TOARGB
"uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
"uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
"uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
@@ -2283,10 +2282,10 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
"movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
"umull v3.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v3.8h, v1.8b, v25.8b \n" // G
"umlal v3.8h, v2.8b, v26.8b \n" // R
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
@@ -2311,10 +2310,10 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
"umull v3.8h, v0.8b, v4.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
@@ -2338,10 +2337,10 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
"movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
"umull v3.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v3.8h, v1.8b, v25.8b \n" // G
"umlal v3.8h, v2.8b, v26.8b \n" // R
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
@@ -2363,9 +2362,9 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // R
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // B
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
@@ -2387,9 +2386,9 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // R
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // B
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
@@ -2411,9 +2410,9 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // R
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
@@ -2435,9 +2434,9 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
@@ -2459,9 +2458,9 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
@@ -2482,9 +2481,9 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
"movi v6.8b, #77 \n" // R * 0.2990 coefficient
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v0.8h, v1.8b, v5.8b \n" // G
"umlal v0.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
@@ -2504,11 +2503,11 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
"movi v4.8b, #77 \n" // R * 0.2990 coefficient
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // R
+ "umull v0.8h, v0.8b, v4.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v0.8h, v1.8b, v5.8b \n" // G
- "umlal v0.8h, v2.8b, v6.8b \n" // B
+ "umlal v0.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n"
@@ -2540,11 +2539,11 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
"1: \n"
"ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n"
- "prfm pldl1keep, [%1, 448] \n"
- "prfm pldl1keep, [%2, 448] \n"
"subs %w3, %w3, #16 \n"
"umull v2.8h, v0.8b, v4.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
"umull2 v3.8h, v0.16b, v4.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
"umlal v2.8h, v1.8b, v5.8b \n"
"umlal2 v3.8h, v1.16b, v5.16b \n"
"rshrn v0.8b, v2.8h, #8 \n"
@@ -2557,10 +2556,10 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
"50: \n"
"ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n"
- "prfm pldl1keep, [%1, 448] \n"
- "prfm pldl1keep, [%2, 448] \n"
"subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 50b \n"
"b 99f \n"
@@ -2568,8 +2567,8 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"ld1 {v0.16b}, [%1], #16 \n"
- "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n"
@@ -2596,11 +2595,11 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
"8: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "prfm pldl1keep, [%0, 448] \n"
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "prfm pldl1keep, [%1, 448] \n"
"umull v18.8h, v6.8b, v3.8b \n" // dr * a
"uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
"uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
@@ -2626,11 +2625,11 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
// ARGB0.
"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel
// ARGB1.
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #1 \n" // 1 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "prfm pldl1keep, [%0, 448] \n"
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "prfm pldl1keep, [%1, 448] \n"
"umull v18.8h, v6.8b, v3.8b \n" // dr * a
"uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
"uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
@@ -2664,9 +2663,9 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
// Attenuate 8 pixels.
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "prfm pldl1keep, [%0, 448] \n"
"umull v5.8h, v1.8b, v3.8b \n" // g * a
"umull v6.8h, v2.8b, v3.8b \n" // r * a
"uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
@@ -2697,9 +2696,9 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
// 8 pixel loop.
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "prfm pldl1keep, [%0, 448] \n"
"uxtl v1.8h, v1.8b \n"
"uxtl v2.8h, v2.8b \n"
"sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
@@ -2739,9 +2738,9 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// 8 pixel loop.
"1: \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "prfm pldl1keep, [%0, 448] \n"
"uxtl v5.8h, v5.8b \n"
"uxtl v6.8h, v6.8b \n"
"uxtl v7.8h, v7.8b \n"
@@ -2772,9 +2771,9 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
"movi v26.8b, #77 \n" // R * 0.2990 coefficient
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v4.8h, v1.8b, v25.8b \n" // G
"umlal v4.8h, v2.8b, v26.8b \n" // R
"uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
@@ -2807,9 +2806,9 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
"movi v30.8b, #50 \n" // BR coefficient
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
- "prfm pldl1keep, [%0, 448] \n"
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "prfm pldl1keep, [%0, 448] \n"
"umlal v4.8h, v1.8b, v21.8b \n" // G
"umlal v4.8h, v2.8b, v22.8b \n" // R
"umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
@@ -2844,9 +2843,9 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
"1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "prfm pldl1keep, [%0, 448] \n"
"uxtl v17.8h, v17.8b \n" // g
"uxtl v18.8h, v18.8b \n" // r
"uxtl v19.8h, v19.8b \n" // a
@@ -2903,11 +2902,11 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "prfm pldl1keep, [%0, 448] \n"
"umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "prfm pldl1keep, [%1, 448] \n"
"umull v2.8h, v2.8b, v6.8b \n" // multiply R
"umull v3.8h, v3.8b, v7.8b \n" // multiply A
"rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
@@ -2934,11 +2933,11 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
"uqadd v1.8b, v1.8b, v5.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
@@ -2961,11 +2960,11 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
"uqsub v1.8b, v1.8b, v5.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
@@ -2993,11 +2992,11 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
"1: \n"
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "prfm pldl1keep, [%0, 448] \n"
"orr v1.8b, v0.8b, v0.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
"orr v2.8b, v0.8b, v0.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"b.gt 1b \n"
@@ -3019,10 +3018,10 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
"ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
"uqadd v0.16b, v0.16b, v1.16b \n" // add
+ "prfm pldl1keep, [%1, 448] \n"
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
@@ -3048,10 +3047,10 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
"1: \n"
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
"uqadd v1.8b, v0.8b, v2.8b \n" // add
+ "prfm pldl1keep, [%1, 448] \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
@@ -3075,18 +3074,18 @@ void SobelXRow_NEON(const uint8_t* src_y0,
"1: \n"
"ld1 {v0.8b}, [%0],%5 \n" // top
"ld1 {v1.8b}, [%0],%6 \n"
- "prfm pldl1keep, [%0, 448] \n"
"usubl v0.8h, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
"ld1 {v2.8b}, [%1],%5 \n" // center * 2
"ld1 {v3.8b}, [%1],%6 \n"
- "prfm pldl1keep, [%1, 448] \n"
"usubl v1.8h, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
"add v0.8h, v0.8h, v1.8h \n"
"add v0.8h, v0.8h, v1.8h \n"
"ld1 {v2.8b}, [%2],%5 \n" // bottom
"ld1 {v3.8b}, [%2],%6 \n"
- "prfm pldl1keep, [%2, 448] \n"
"subs %w4, %w4, #8 \n" // 8 pixels
+ "prfm pldl1keep, [%2, 448] \n"
"usubl v1.8h, v2.8b, v3.8b \n"
"add v0.8h, v0.8h, v1.8h \n"
"abs v0.8h, v0.8h \n"
@@ -3124,11 +3123,11 @@ void SobelYRow_NEON(const uint8_t* src_y0,
"add v0.8h, v0.8h, v1.8h \n"
"ld1 {v2.8b}, [%0],%5 \n" // right
"ld1 {v3.8b}, [%1],%5 \n"
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #8 \n" // 8 pixels
"usubl v1.8h, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
"add v0.8h, v0.8h, v1.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
"abs v0.8h, v0.8h \n"
"uqxtn v0.8b, v0.8h \n"
"st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
@@ -3151,9 +3150,9 @@ void HalfFloat1Row_NEON(const uint16_t* src,
asm volatile(
"1: \n"
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // 8 int's
+ "prfm pldl1keep, [%0, 448] \n"
"uxtl2 v3.4s, v1.8h \n"
"scvtf v2.4s, v2.4s \n" // 8 floats
"scvtf v3.4s, v3.4s \n"
@@ -3175,9 +3174,9 @@ void HalfFloatRow_NEON(const uint16_t* src,
asm volatile(
"1: \n"
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // 8 int's
+ "prfm pldl1keep, [%0, 448] \n"
"uxtl2 v3.4s, v1.8h \n"
"scvtf v2.4s, v2.4s \n" // 8 floats
"scvtf v3.4s, v3.4s \n"
@@ -3201,9 +3200,9 @@ void ByteToFloatRow_NEON(const uint8_t* src,
asm volatile(
"1: \n"
"ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v1.8h, v1.8b \n" // 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
"uxtl v2.4s, v1.4h \n" // 8 ints
"uxtl2 v3.4s, v1.8h \n"
"scvtf v2.4s, v2.4s \n" // 8 floats
@@ -3230,9 +3229,9 @@ float ScaleMaxSamples_NEON(const float* src,
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "prfm pldl1keep, [%0, 448] \n"
"fmul v4.4s, v2.4s, %4.s[0] \n" // scale
"fmax v5.4s, v5.4s, v1.4s \n" // max
"fmax v6.4s, v6.4s, v2.4s \n"
@@ -3260,9 +3259,9 @@ float ScaleSumSamples_NEON(const float* src,
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "prfm pldl1keep, [%0, 448] \n"
"fmul v4.4s, v2.4s, %4.s[0] \n"
"fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
"fmla v6.4s, v2.4s, v2.4s \n"
@@ -3470,10 +3469,10 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
"1: \n"
"ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
"ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
- "prfm pldl1keep, [%0, 448] \n"
- "prfm pldl1keep, [%1, 448] \n"
"zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
+ "prfm pldl1keep, [%0, 448] \n"
"zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
+ "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
"b.gt 1b \n"
@@ -3494,12 +3493,12 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
- "prfm pldl1keep, [%0, 448] \n"
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "prfm pldl1keep, [%1, 448] \n"
"uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
"uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
"uqrshrn v2.8b, v1.8h, #2 \n"
@@ -3523,12 +3522,12 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
- "prfm pldl1keep, [%0, 448] \n"
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "prfm pldl1keep, [%1, 448] \n"
"uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
"uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
"uqrshrn v1.8b, v1.8h, #2 \n"
@@ -3548,8 +3547,8 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "prfm pldl1keep, [%0, 448] \n"
"st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
@@ -3570,9 +3569,9 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
"1: \n"
"ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
"ld1 {v1.16b}, [%0], 16 \n"
- "prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"tbl v0.16b, {v0.16b}, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
"tbl v1.16b, {v1.16b}, v2.16b \n"
"stp q0, q1, [%1], 32 \n" // store 16 VU pixels
"b.gt 1b \n"
@@ -3625,34 +3624,24 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
uint16_t* dst_v,
int depth,
int width) {
+ int shift = depth - 16; // Negative for right shift.
asm volatile(
- "dup v0.4s, %w3 \n"
+ "dup v2.8h, %w4 \n"
"1: \n"
- "ld2 {v1.8h, v2.8h}, [%0], #32 \n" // load 8 UV
+ "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "ushl v0.8h, v0.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
- "ushll v3.4s, v1.4h, #0 \n"
- "ushll2 v4.4s, v1.8h, #0 \n"
- "ushl v3.4s, v3.4s, v0.4s \n"
- "ushl v4.4s, v4.4s, v0.4s \n"
- "xtn v1.4h, v3.4s \n"
- "xtn2 v1.8h, v4.4s \n"
- "ushll v3.4s, v2.4h, #0 \n"
- "ushll2 v4.4s, v2.8h, #0 \n"
- "ushl v3.4s, v3.4s, v0.4s \n"
- "ushl v4.4s, v4.4s, v0.4s \n"
- "xtn v2.4h, v3.4s \n"
- "xtn2 v2.8h, v4.4s \n"
- "subs %w4, %w4, #8 \n" // 8 src pixels per loop
- "st1 {v1.8h}, [%1], #16 \n" // store 8 U pixels
- "st1 {v2.8h}, [%2], #16 \n" // store 8 V pixels
+ "ushl v1.8h, v1.8h, v2.8h \n"
+ "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels
+ "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(depth), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "v0", "v1", "v2");
}
void MergeUVRow_16_NEON(const uint16_t* src_u,
@@ -3662,23 +3651,22 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
int width) {
int shift = 16 - depth;
asm volatile(
- "dup v2.8h, %w3 \n"
+ "dup v2.8h, %w4 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
- "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
- "prfm pldl1keep, [%1, 448] \n"
"ushl v0.8h, v0.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
"ushl v1.8h, v1.8h, v2.8h \n"
- "subs %w4, %w4, #8 \n" // 8 src pixels per loop
+ "prfm pldl1keep, [%1, 448] \n"
"st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
- "+r"(shift), // %3
- "+r"(width) // %4
- :
+ "+r"(width) // %3
+ : "r"(shift) // %4
: "cc", "memory", "v0", "v1", "v2");
}
@@ -3690,8 +3678,8 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
"dup v2.8h, %w2 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
- "prfm pldl1keep, [%0, 448] \n"
"mul v0.8h, v0.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
"mul v1.8h, v1.8h, v2.8h \n"
"stp q0, q1, [%1] \n" // store 16 pixels
"add %1, %1, #32 \n"
@@ -3713,9 +3701,9 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"dup v0.8h, %w2 \n"
"1: \n"
"ldp q1, q2, [%0], #32 \n"
- "prfm pldl1keep, [%0, 448] \n"
"ushll v3.4s, v1.4h, #0 \n"
"ushll v4.4s, v2.4h, #0 \n"
+ "prfm pldl1keep, [%0, 448] \n"
"ushll2 v1.4s, v1.8h, #0 \n"
"ushll2 v2.4s, v2.8h, #0 \n"
"mul v3.4s, v0.4s, v3.4s \n"