diff options
author | Frank Barchard <fbarchard@google.com> | 2021-03-24 13:45:04 -0700 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2021-03-24 21:37:10 +0000 |
commit | 312c02a5aad4adda67cb2e0cc93a497d12845522 (patch) | |
tree | ce776a4db30d2319fad3bbf41fe48d6cdf1e2602 /source/row_neon.cc | |
parent | d8f1bfc9816a9fc76f3a25cc0ee272fb9c07622a (diff) | |
download | libyuv-312c02a5aad4adda67cb2e0cc93a497d12845522.tar.gz |
Fixes for SplitUVPlane_16 and MergeUVPlane_16
Planar functions pass depth instead of scale factor.
Row functions pass shift instead of depth. Add assert to C.
AVX shift instruction expects a single shift value in XMM.
Neon passes shift as an input operand (not an output operand).
Split Neon is reimplemented as a left shift on 16-bit lanes by a negative amount, which achieves a right shift.
Add planar unit tests.
Bug: libyuv:888
Change-Id: I8fe62d3d777effc5321c361cd595c58b7f93807e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2782086
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Diffstat (limited to 'source/row_neon.cc')
-rw-r--r-- | source/row_neon.cc | 39 |
1 files changed, 14 insertions, 25 deletions
diff --git a/source/row_neon.cc b/source/row_neon.cc index 5d109a3b..2165d0d0 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -3270,32 +3270,22 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_v, int depth, int width) { + int shift = depth - 16; // Negative for right shift. asm volatile( - "vdup.32 q0, %3 \n" + "vdup.16 q2, %4 \n" "1: \n" - "vld2.16 {q1, q2}, [%0]! \n" // load 8 UV - "vmovl.u16 q3, d2 \n" - "vmovl.u16 q4, d3 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vmovn.u32 d2, q3 \n" - "vmovn.u32 d3, q4 \n" - "vmovl.u16 q3, d4 \n" - "vmovl.u16 q4, d5 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vmovn.u32 d4, q3 \n" - "vmovn.u32 d5, q4 \n" - "subs %4, %4, #8 \n" // 8 src pixels per loop - "vst1.16 {q1}, [%1]! \n" // store 8 U pixels - "vst1.16 {q2}, [%2]! \n" // store 8 V pixels + "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV + "vshl.u16 q0, q0, q2 \n" + "vshl.u16 q1, q1, q2 \n" + "subs %3, %3, #8 \n" // 8 src pixels per loop + "vst1.16 {q0}, [%1]! \n" // store 8 U pixels + "vst1.16 {q1}, [%2]! \n" // store 8 V pixels "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(depth), // %3 - "+r"(width) // %4 - : + "+r"(width) // %3 + : "r"(shift) // %4 : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); } @@ -3306,21 +3296,20 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int width) { int shift = 16 - depth; asm volatile( - "vdup.16 q2, %3 \n" + "vdup.16 q2, %4 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" // load 8 U "vld1.16 {q1}, [%1]! \n" // load 8 V "vshl.u16 q0, q0, q2 \n" "vshl.u16 q1, q1, q2 \n" - "subs %4, %4, #8 \n" // 8 src pixels per loop + "subs %3, %3, #8 \n" // 8 src pixels per loop "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels "bgt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 - "+r"(shift), // %3 - "+r"(width) // %4 - : + "+r"(width) // %3 + : "r"(shift) // %4 : "cc", "memory", "q0", "q1", "q2"); } |