aboutsummaryrefslogtreecommitdiff
path: root/source/row_neon.cc
diff options
context:
space:
mode:
author: Frank Barchard <fbarchard@google.com> 2021-03-24 13:45:04 -0700
committer: Frank Barchard <fbarchard@chromium.org> 2021-03-24 21:37:10 +0000
commit: 312c02a5aad4adda67cb2e0cc93a497d12845522 (patch)
tree: ce776a4db30d2319fad3bbf41fe48d6cdf1e2602 /source/row_neon.cc
parent: d8f1bfc9816a9fc76f3a25cc0ee272fb9c07622a (diff)
download: libyuv-312c02a5aad4adda67cb2e0cc93a497d12845522.tar.gz
Fixes for SplitUVPlane_16 and MergeUVPlane_16
Planar functions pass depth instead of scale factor. Row functions pass shift instead of depth. Add assert to C. AVX shift instruction expects a single shift value in XMM. Neon passes shift as an input (not an output) operand. Split Neon is reimplemented as a left shift on shorts by a negative amount to achieve a right shift. Add planar unit tests.
Bug: libyuv:888
Change-Id: I8fe62d3d777effc5321c361cd595c58b7f93807e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2782086
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Diffstat (limited to 'source/row_neon.cc')
-rw-r--r-- source/row_neon.cc 39
1 files changed, 14 insertions, 25 deletions
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 5d109a3b..2165d0d0 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -3270,32 +3270,22 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
uint16_t* dst_v,
int depth,
int width) {
+ int shift = depth - 16; // Negative for right shift.
asm volatile(
- "vdup.32 q0, %3 \n"
+ "vdup.16 q2, %4 \n"
"1: \n"
- "vld2.16 {q1, q2}, [%0]! \n" // load 8 UV
- "vmovl.u16 q3, d2 \n"
- "vmovl.u16 q4, d3 \n"
- "vshl.u32 q3, q3, q0 \n"
- "vshl.u32 q4, q4, q0 \n"
- "vmovn.u32 d2, q3 \n"
- "vmovn.u32 d3, q4 \n"
- "vmovl.u16 q3, d4 \n"
- "vmovl.u16 q4, d5 \n"
- "vshl.u32 q3, q3, q0 \n"
- "vshl.u32 q4, q4, q0 \n"
- "vmovn.u32 d4, q3 \n"
- "vmovn.u32 d5, q4 \n"
- "subs %4, %4, #8 \n" // 8 src pixels per loop
- "vst1.16 {q1}, [%1]! \n" // store 8 U pixels
- "vst1.16 {q2}, [%2]! \n" // store 8 V pixels
+ "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
+ "vshl.u16 q0, q0, q2 \n"
+ "vshl.u16 q1, q1, q2 \n"
+ "subs %3, %3, #8 \n" // 8 src pixels per loop
+ "vst1.16 {q0}, [%1]! \n" // store 8 U pixels
+ "vst1.16 {q1}, [%2]! \n" // store 8 V pixels
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(depth), // %3
- "+r"(width) // %4
- :
+ "+r"(width) // %3
+ : "r"(shift) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
}
@@ -3306,21 +3296,20 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
int width) {
int shift = 16 - depth;
asm volatile(
- "vdup.16 q2, %3 \n"
+ "vdup.16 q2, %4 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n" // load 8 U
"vld1.16 {q1}, [%1]! \n" // load 8 V
"vshl.u16 q0, q0, q2 \n"
"vshl.u16 q1, q1, q2 \n"
- "subs %4, %4, #8 \n" // 8 src pixels per loop
+ "subs %3, %3, #8 \n" // 8 src pixels per loop
"vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels
"bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
- "+r"(shift), // %3
- "+r"(width) // %4
- :
+ "+r"(width) // %3
+ : "r"(shift) // %4
: "cc", "memory", "q0", "q1", "q2");
}