diff options
author | Frank Barchard <fbarchard@google.com> | 2021-03-24 13:45:04 -0700 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2021-03-24 21:37:10 +0000 |
commit | 312c02a5aad4adda67cb2e0cc93a497d12845522 (patch) | |
tree | ce776a4db30d2319fad3bbf41fe48d6cdf1e2602 /source/row_neon64.cc | |
parent | d8f1bfc9816a9fc76f3a25cc0ee272fb9c07622a (diff) | |
download | libyuv-312c02a5aad4adda67cb2e0cc93a497d12845522.tar.gz |
Fixes for SplitUVPlane_16 and MergeUVPlane_16
Planar functions pass depth instead of scale factor.
Row functions pass shift instead of depth. Add assert to C.
AVX shift instruction expects a single shift value in XMM.
Neon pass shift as input (not output).
Split Neon reimplemented as left shift on shorts by negative to achieve right shift.
Add planar unitests
Bug: libyuv:888
Change-Id: I8fe62d3d777effc5321c361cd595c58b7f93807e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2782086
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r-- | source/row_neon64.cc | 278 |
1 files changed, 133 insertions, 145 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 3281e90f..903bf5cd 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -673,8 +673,8 @@ void SplitUVRow_NEON(const uint8_t* src_uv, asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store U "st1 {v1.16b}, [%2], #16 \n" // store V "b.gt 1b \n" @@ -696,9 +696,9 @@ void MergeUVRow_NEON(const uint8_t* src_u, "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV "b.gt 1b \n" : "+r"(src_u), // %0 @@ -719,8 +719,8 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, asm volatile( "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store R "st1 {v1.16b}, [%2], #16 \n" // store G "st1 {v2.16b}, [%3], #16 \n" // store B @@ -746,12 +746,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r, "ld1 {v0.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v2.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB - "prfm pldl1keep, [%0, 448] \n" "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 @@ -773,8 +772,8 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w5, %w5, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%3], #16 \n" // store B "st1 {v1.16b}, [%2], #16 \n" // store G "st1 {v2.16b}, [%1], #16 \n" // store R @@ -804,11 +803,11 @@ void MergeARGBRow_NEON(const uint8_t* src_r, "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v3.16b}, [%3], #16 \n" // load A + "subs %w5, %w5, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" "prfm pldl1keep, [%3, 448] \n" - "subs %w5, %w5, #16 \n" // 16 processed per loop "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB "b.gt 1b \n" : "+r"(src_r), // %0 @@ -831,8 +830,8 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%3], #16 \n" // store B "st1 {v1.16b}, [%2], #16 \n" // store G "st1 {v2.16b}, [%1], #16 \n" // store R @@ -859,10 +858,10 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, "ld1 {v2.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v0.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB "b.gt 1b \n" : "+r"(src_r), // %0 @@ -1072,9 +1071,9 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "movi v5.8b, #255 \n" // Alpha "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v3.8b, v1.8b, v1.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" "orr v4.8b, v0.8b, v0.8b \n" // move r "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a "b.gt 1b \n" @@ -1091,9 +1090,9 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { "movi v0.8b, #255 \n" // Alpha "1: \n" "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v2.8b, v4.8b, v4.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" "orr v1.8b, v5.8b, v5.8b \n" // move r "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r "b.gt 1b \n" @@ -1109,9 +1108,9 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v3.8b, v1.8b, v1.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" "orr v4.8b, v0.8b, v0.8b \n" // move r "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r "b.gt 1b \n" @@ -1143,8 +1142,8 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, "movi v3.8b, #255 \n" // Alpha "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" @@ -1233,8 +1232,8 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" @@ -1252,8 +1251,8 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of // RGB24 "b.gt 1b \n" @@ -1269,9 +1268,9 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v4.8b, v2.8b, v2.8b \n" // mov g + "prfm pldl1keep, [%0, 448] \n" "orr v5.8b, v1.8b, v1.8b \n" // mov b "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b "b.gt 1b \n" @@ -1287,8 +1286,8 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "b.gt 1b \n" : "+r"(src_yuy2), // %0 @@ -1303,8 +1302,8 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. "b.gt 1b \n" : "+r"(src_uyvy), // %0 @@ -1322,8 +1321,8 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "prfm pldl1keep, [%0, 448] \n" "st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v3.8b}, [%2], #8 \n" // store 8 V. "b.gt 1b \n" @@ -1343,8 +1342,8 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v2.8b}, [%2], #8 \n" // store 8 V. "b.gt 1b \n" @@ -1366,10 +1365,10 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "prfm pldl1keep, [%0, 448] \n" "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V "st1 {v1.8b}, [%2], #8 \n" // store 8 U. "st1 {v3.8b}, [%3], #8 \n" // store 8 V. @@ -1394,10 +1393,10 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "prfm pldl1keep, [%0, 448] \n" "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V "st1 {v0.8b}, [%2], #8 \n" // store 8 U. "st1 {v2.8b}, [%3], #8 \n" // store 8 V. @@ -1422,8 +1421,8 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #4 \n" // 4 processed per loop + "prfm pldl1keep, [%0, 448] \n" "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels "st1 {v1.16b}, [%1], #16 \n" // store 4. "b.gt 1b \n" @@ -1443,11 +1442,11 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, asm volatile( "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #16 \n" // 16 pixels "orr v2.8b, v1.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "b.gt 1b \n" : "+r"(src_y), // %0 @@ -1467,8 +1466,8 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, asm volatile( "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "prfm pldl1keep, [%0, 448] \n" "orr v3.8b, v2.8b, v2.8b \n" + "prfm pldl1keep, [%0, 448] \n" "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "subs %w4, %w4, #16 \n" // 16 pixels @@ -1490,8 +1489,8 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. "b.gt 1b \n" @@ -1511,9 +1510,9 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v20.8b, v20.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" "uqadd v21.8b, v21.8b, v1.8b \n" "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. @@ -1532,8 +1531,8 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels "b.gt 1b \n" @@ -1553,8 +1552,8 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels "b.gt 1b \n" @@ -1575,8 +1574,8 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels "mov v1.16b, v0.16b \n" - "mov v3.16b, v2.16b \n" "prfm pldl1keep, [%0, 448] \n" + "mov v3.16b, v2.16b \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels @@ -1597,9 +1596,9 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, "ldp q0, q2, [%0], #32 \n" // load 8 pixels "tbl v0.16b, {v0.16b}, v4.16b \n" "tbl v2.16b, {v2.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" "mov v1.16b, v0.16b \n" "mov v3.16b, v2.16b \n" - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels @@ -1622,8 +1621,8 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64, "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels "ldp q2, q3, [%0], #32 \n" // load 4 pixels - "prfm pldl1keep, [%0, 448] \n" "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "stp q0, q2, [%1], #32 \n" // store 8 pixels @@ -1646,8 +1645,8 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels "ldp q2, q3, [%0], #32 \n" // load 4 pixels - "prfm pldl1keep, [%0, 448] \n" "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "stp q0, q2, [%1], #32 \n" // store 8 pixels @@ -1667,9 +1666,9 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -1708,9 +1707,9 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -1730,9 +1729,9 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v0.8h, v1.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v0.8h, v2.8b, v5.8b \n" // G "umlal v0.8h, v3.8b, v6.8b \n" // R "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y @@ -1760,9 +1759,9 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "movi v29.16b,#0x80 \n" // 128.5 "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlsl v4.8h, v1.8b, v25.8b \n" // G "umlsl v4.8h, v2.8b, v26.8b \n" // R "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned @@ -1823,14 +1822,14 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. @@ -1869,13 +1868,13 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. @@ -1909,13 +1908,13 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. @@ -1949,13 +1948,13 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. @@ -1989,13 +1988,13 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. @@ -2029,13 +2028,13 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, RGBTOUV_SETUP_REG "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. @@ -2069,13 +2068,13 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, RGBTOUV_SETUP_REG "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels - "prfm pldl1keep, [%1, 448] \n" "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. @@ -2110,9 +2109,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. @@ -2122,9 +2121,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%1, 448] \n" RGB565TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. @@ -2168,9 +2167,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" RGB555TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. @@ -2180,9 +2179,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%1, 448] \n" RGB555TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. @@ -2226,9 +2225,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, RGBTOUV_SETUP_REG // sets v20-v25 "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. @@ -2238,9 +2237,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%1, 448] \n" ARGB4444TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. @@ -2283,10 +2282,10 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { "movi v27.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. RGB565TOARGB "umull v3.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -2311,10 +2310,10 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB "umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -2338,10 +2337,10 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, "movi v27.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB "umull v3.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -2363,9 +2362,9 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // R + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v3.8b, v6.8b \n" // B "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2387,9 +2386,9 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // R + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // B "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2411,9 +2410,9 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v3.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2435,9 +2434,9 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2459,9 +2458,9 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2482,9 +2481,9 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v0.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v0.8h, v1.8b, v5.8b \n" // G "umlal v0.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y @@ -2504,11 +2503,11 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { "movi v4.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // R + "umull v0.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v0.8h, v1.8b, v5.8b \n" // G - "umlal v0.8h, v2.8b, v6.8b \n" // B + "umlal v0.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "b.gt 1b \n" @@ -2540,11 +2539,11 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "1: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" "subs %w3, %w3, #16 \n" "umull v2.8h, v0.8b, v4.8b \n" + "prfm pldl1keep, [%1, 448] \n" "umull2 v3.8h, v0.16b, v4.16b \n" + "prfm pldl1keep, [%2, 448] \n" "umlal v2.8h, v1.8b, v5.8b \n" "umlal2 v3.8h, v1.16b, v5.16b \n" "rshrn v0.8b, v2.8h, #8 \n" @@ -2557,10 +2556,10 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "50: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 50b \n" "b 99f \n" @@ -2568,8 +2567,8 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, // Blend 100 / 0 - Copy row unchanged. "100: \n" "ld1 {v0.16b}, [%1], #16 \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 100b \n" @@ -2596,11 +2595,11 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, "8: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a + "prfm pldl1keep, [%0, 448] \n" "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "prfm pldl1keep, [%1, 448] \n" "umull v18.8h, v6.8b, v3.8b \n" // dr * a "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 @@ -2626,11 +2625,11 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, // ARGB0. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel // ARGB1. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #1 \n" // 1 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a + "prfm pldl1keep, [%0, 448] \n" "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "prfm pldl1keep, [%1, 448] \n" "umull v18.8h, v6.8b, v3.8b \n" // dr * a "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 @@ -2664,9 +2663,9 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, // Attenuate 8 pixels. "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v3.8b \n" // b * a + "prfm pldl1keep, [%0, 448] \n" "umull v5.8h, v1.8b, v3.8b \n" // g * a "umull v6.8h, v2.8b, v3.8b \n" // r * a "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 @@ -2697,9 +2696,9 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, // 8 pixel loop. "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. - "prfm pldl1keep, [%0, 448] \n" "subs %w1, %w1, #8 \n" // 8 processed per loop. "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "prfm pldl1keep, [%0, 448] \n" "uxtl v1.8h, v1.8b \n" "uxtl v2.8h, v2.8b \n" "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale @@ -2739,9 +2738,9 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // 8 pixel loop. "1: \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "prfm pldl1keep, [%0, 448] \n" "uxtl v5.8h, v5.8b \n" "uxtl v6.8h, v6.8b \n" "uxtl v7.8h, v7.8b \n" @@ -2772,9 +2771,9 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { "movi v26.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B @@ -2807,9 +2806,9 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { "movi v30.8b, #50 \n" // BR coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w1, %w1, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "prfm pldl1keep, [%0, 448] \n" "umlal v4.8h, v1.8b, v21.8b \n" // G "umlal v4.8h, v2.8b, v22.8b \n" // R "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G @@ -2844,9 +2843,9 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit + "prfm pldl1keep, [%0, 448] \n" "uxtl v17.8h, v17.8b \n" // g "uxtl v18.8h, v18.8b \n" // r "uxtl v19.8h, v19.8b \n" // a @@ -2903,11 +2902,11 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "prfm pldl1keep, [%0, 448] \n" "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "prfm pldl1keep, [%1, 448] \n" "umull v2.8h, v2.8b, v6.8b \n" // multiply R "umull v3.8h, v3.8b, v7.8b \n" // multiply A "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B @@ -2934,11 +2933,11 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" "uqadd v1.8b, v1.8b, v5.8b \n" + "prfm pldl1keep, [%1, 448] \n" "uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v3.8b, v3.8b, v7.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB @@ -2961,11 +2960,11 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqsub v0.8b, v0.8b, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" "uqsub v1.8b, v1.8b, v5.8b \n" + "prfm pldl1keep, [%1, 448] \n" "uqsub v2.8b, v2.8b, v6.8b \n" "uqsub v3.8b, v3.8b, v7.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB @@ -2993,11 +2992,11 @@ void SobelRow_NEON(const uint8_t* src_sobelx, "1: \n" "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v1.8b \n" // add + "prfm pldl1keep, [%0, 448] \n" "orr v1.8b, v0.8b, v0.8b \n" + "prfm pldl1keep, [%1, 448] \n" "orr v2.8b, v0.8b, v0.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" @@ -3019,10 +3018,10 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "uqadd v0.16b, v0.16b, v1.16b \n" // add + "prfm pldl1keep, [%1, 448] \n" "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. "b.gt 1b \n" : "+r"(src_sobelx), // %0 @@ -3048,10 +3047,10 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, "1: \n" "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "uqadd v1.8b, v0.8b, v2.8b \n" // add + "prfm pldl1keep, [%1, 448] \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_sobelx), // %0 @@ -3075,18 +3074,18 @@ void SobelXRow_NEON(const uint8_t* src_y0, "1: \n" "ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v1.8b}, [%0],%6 \n" - "prfm pldl1keep, [%0, 448] \n" "usubl v0.8h, v0.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" "ld1 {v2.8b}, [%1],%5 \n" // center * 2 "ld1 {v3.8b}, [%1],%6 \n" - "prfm pldl1keep, [%1, 448] \n" "usubl v1.8h, v2.8b, v3.8b \n" + "prfm pldl1keep, [%1, 448] \n" "add v0.8h, v0.8h, v1.8h \n" "add v0.8h, v0.8h, v1.8h \n" "ld1 {v2.8b}, [%2],%5 \n" // bottom "ld1 {v3.8b}, [%2],%6 \n" - "prfm pldl1keep, [%2, 448] \n" "subs %w4, %w4, #8 \n" // 8 pixels + "prfm pldl1keep, [%2, 448] \n" "usubl v1.8h, v2.8b, v3.8b \n" "add v0.8h, v0.8h, v1.8h \n" "abs v0.8h, v0.8h \n" @@ -3124,11 +3123,11 @@ void SobelYRow_NEON(const uint8_t* src_y0, "add v0.8h, v0.8h, v1.8h \n" "ld1 {v2.8b}, [%0],%5 \n" // right "ld1 {v3.8b}, [%1],%5 \n" - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 pixels "usubl v1.8h, v2.8b, v3.8b \n" + "prfm pldl1keep, [%0, 448] \n" "add v0.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%1, 448] \n" "abs v0.8h, v0.8h \n" "uqxtn v0.8b, v0.8h \n" "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely @@ -3151,9 +3150,9 @@ void HalfFloat1Row_NEON(const uint16_t* src, asm volatile( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's + "prfm pldl1keep, [%0, 448] \n" "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v3.4s, v3.4s \n" @@ -3175,9 +3174,9 @@ void HalfFloatRow_NEON(const uint16_t* src, asm volatile( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's + "prfm pldl1keep, [%0, 448] \n" "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v3.4s, v3.4s \n" @@ -3201,9 +3200,9 @@ void ByteToFloatRow_NEON(const uint8_t* src, asm volatile( "1: \n" "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v1.8h, v1.8b \n" // 8 shorts + "prfm pldl1keep, [%0, 448] \n" "uxtl v2.4s, v1.4h \n" // 8 ints "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats @@ -3230,9 +3229,9 @@ float ScaleMaxSamples_NEON(const float* src, "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "prfm pldl1keep, [%0, 448] \n" "fmul v4.4s, v2.4s, %4.s[0] \n" // scale "fmax v5.4s, v5.4s, v1.4s \n" // max "fmax v6.4s, v6.4s, v2.4s \n" @@ -3260,9 +3259,9 @@ float ScaleSumSamples_NEON(const float* src, "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "prfm pldl1keep, [%0, 448] \n" "fmul v4.4s, v2.4s, %4.s[0] \n" "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares "fmla v6.4s, v2.4s, v2.4s \n" @@ -3470,10 +3469,10 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values + "prfm pldl1keep, [%0, 448] \n" "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values + "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #16 \n" // 16 pixels per loop "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels "b.gt 1b \n" @@ -3494,12 +3493,12 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v2.8b, v1.8h, #2 \n" @@ -3523,12 +3522,12 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v1.8b, v1.8h, #2 \n" @@ -3548,8 +3547,8 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 pixels per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels "b.gt 1b \n" : "+r"(src_ayuv), // %0 @@ -3570,9 +3569,9 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "1: \n" "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values "ld1 {v1.16b}, [%0], 16 \n" - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 pixels per loop "tbl v0.16b, {v0.16b}, v2.16b \n" + "prfm pldl1keep, [%0, 448] \n" "tbl v1.16b, {v1.16b}, v2.16b \n" "stp q0, q1, [%1], 32 \n" // store 16 VU pixels "b.gt 1b \n" @@ -3625,34 +3624,24 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_v, int depth, int width) { + int shift = depth - 16; // Negative for right shift. asm volatile( - "dup v0.4s, %w3 \n" + "dup v2.8h, %w4 \n" "1: \n" - "ld2 {v1.8h, v2.8h}, [%0], #32 \n" // load 8 UV + "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV + "subs %w3, %w3, #8 \n" // 8 src pixels per loop + "ushl v0.8h, v0.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" - "ushll v3.4s, v1.4h, #0 \n" - "ushll2 v4.4s, v1.8h, #0 \n" - "ushl v3.4s, v3.4s, v0.4s \n" - "ushl v4.4s, v4.4s, v0.4s \n" - "xtn v1.4h, v3.4s \n" - "xtn2 v1.8h, v4.4s \n" - "ushll v3.4s, v2.4h, #0 \n" - "ushll2 v4.4s, v2.8h, #0 \n" - "ushl v3.4s, v3.4s, v0.4s \n" - "ushl v4.4s, v4.4s, v0.4s \n" - "xtn v2.4h, v3.4s \n" - "xtn2 v2.8h, v4.4s \n" - "subs %w4, %w4, #8 \n" // 8 src pixels per loop - "st1 {v1.8h}, [%1], #16 \n" // store 8 U pixels - "st1 {v2.8h}, [%2], #16 \n" // store 8 V pixels + "ushl v1.8h, v1.8h, v2.8h \n" + "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels + "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(depth), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); + "+r"(width) // %3 + : "r"(shift) // %4 + : "cc", "memory", "v0", "v1", "v2"); } void MergeUVRow_16_NEON(const uint16_t* src_u, @@ -3662,23 +3651,22 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int width) { int shift = 16 - depth; asm volatile( - "dup v2.8h, %w3 \n" + "dup v2.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U - "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #8 \n" // 8 src pixels per loop "ld1 {v1.8h}, [%1], #16 \n" // load 8 V - "prfm pldl1keep, [%1, 448] \n" "ushl v0.8h, v0.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" "ushl v1.8h, v1.8h, v2.8h \n" - "subs %w4, %w4, #8 \n" // 8 src pixels per loop + "prfm pldl1keep, [%1, 448] \n" "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 - "+r"(shift), // %3 - "+r"(width) // %4 - : + "+r"(width) // %3 + : "r"(shift) // %4 : "cc", "memory", "v0", "v1", "v2"); } @@ -3690,8 +3678,8 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, "dup v2.8h, %w2 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" - "prfm pldl1keep, [%0, 448] \n" "mul v0.8h, v0.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" "mul v1.8h, v1.8h, v2.8h \n" "stp q0, q1, [%1] \n" // store 16 pixels "add %1, %1, #32 \n" @@ -3713,9 +3701,9 @@ void DivideRow_16_NEON(const uint16_t* src_y, "dup v0.8h, %w2 \n" "1: \n" "ldp q1, q2, [%0], #32 \n" - "prfm pldl1keep, [%0, 448] \n" "ushll v3.4s, v1.4h, #0 \n" "ushll v4.4s, v2.4h, #0 \n" + "prfm pldl1keep, [%0, 448] \n" "ushll2 v1.4s, v1.8h, #0 \n" "ushll2 v2.4s, v2.8h, #0 \n" "mul v3.4s, v0.4s, v3.4s \n" |