Fixes for SplitUVPlane_16 and MergeUVPlane_16

Planar functions pass depth instead of a scale factor. Row functions pass shift instead of depth. Add an assert to the C versions. The AVX shift instruction expects a single shift value in an XMM register. Neon passes shift as an input (not an output). The Neon split is reimplemented as a left shift on shorts by a negative amount to achieve a right shift. Add planar unit tests. Bug: libyuv:888 Change-Id: I8fe62d3d777effc5321c361cd595c58b7f93807e Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2782086 Reviewed-by: richard winterton <rrwinterton@gmail.com> Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
author: Frank Barchard <fbarchard@google.com> 2021-03-24 13:45:04 -0700
committer: Frank Barchard <fbarchard@chromium.org> 2021-03-24 21:37:10 +0000
commit: 312c02a5aad4adda67cb2e0cc93a497d12845522 (patch)
tree: ce776a4db30d2319fad3bbf41fe48d6cdf1e2602 /source/row_neon.cc
parent: d8f1bfc9816a9fc76f3a25cc0ee272fb9c07622a (diff)
download: libyuv-312c02a5aad4adda67cb2e0cc93a497d12845522.tar.gz
1 files changed, 14 insertions, 25 deletions
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 5d109a3b..2165d0d0 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -3270,32 +3270,22 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
                         uint16_t* dst_v,
                         int depth,
                         int width) {
+  int shift = depth - 16;  // Negative for right shift.
   asm volatile(
-      "vdup.32     q0, %3                        \n"
+      "vdup.16     q2, %4                        \n"
       "1:                                        \n"
-      "vld2.16     {q1, q2}, [%0]!               \n"  // load 8 UV
-      "vmovl.u16   q3, d2                        \n"
-      "vmovl.u16   q4, d3                        \n"
-      "vshl.u32    q3, q3, q0                    \n"
-      "vshl.u32    q4, q4, q0                    \n"
-      "vmovn.u32   d2, q3                        \n"
-      "vmovn.u32   d3, q4                        \n"
-      "vmovl.u16   q3, d4                        \n"
-      "vmovl.u16   q4, d5                        \n"
-      "vshl.u32    q3, q3, q0                    \n"
-      "vshl.u32    q4, q4, q0                    \n"
-      "vmovn.u32   d4, q3                        \n"
-      "vmovn.u32   d5, q4                        \n"
-      "subs        %4, %4, #8                    \n"  // 8 src pixels per loop
-      "vst1.16     {q1}, [%1]!                   \n"  // store 8 U pixels
-      "vst1.16     {q2}, [%2]!                   \n"  // store 8 V pixels
+      "vld2.16     {q0, q1}, [%0]!               \n"  // load 8 UV
+      "vshl.u16    q0, q0, q2                    \n"
+      "vshl.u16    q1, q1, q2                    \n"
+      "subs        %3, %3, #8                    \n"  // 8 src pixels per loop
+      "vst1.16     {q0}, [%1]!                   \n"  // store 8 U pixels
+      "vst1.16     {q1}, [%2]!                   \n"  // store 8 V pixels
       "bgt         1b                            \n"
       : "+r"(src_uv),  // %0
         "+r"(dst_u),   // %1
         "+r"(dst_v),   // %2
-        "+r"(depth),   // %3
-        "+r"(width)    // %4
-      :
+        "+r"(width)    // %3
+      : "r"(shift)     // %4
       : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
 }
 
@@ -3306,21 +3296,20 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
                         int width) {
   int shift = 16 - depth;
   asm volatile(
-      "vdup.16     q2, %3                        \n"
+      "vdup.16     q2, %4                        \n"
       "1:                                        \n"
       "vld1.16     {q0}, [%0]!                   \n"  // load 8 U
       "vld1.16     {q1}, [%1]!                   \n"  // load 8 V
       "vshl.u16    q0, q0, q2                    \n"
       "vshl.u16    q1, q1, q2                    \n"
-      "subs        %4, %4, #8                    \n"  // 8 src pixels per loop
+      "subs        %3, %3, #8                    \n"  // 8 src pixels per loop
       "vst2.16     {q0, q1}, [%2]!               \n"  // store 8 UV pixels
       "bgt         1b                            \n"
       : "+r"(src_u),   // %0
         "+r"(src_v),   // %1
         "+r"(dst_uv),  // %2
-        "+r"(shift),   // %3
-        "+r"(width)    // %4
-      :
+        "+r"(width)    // %3
+      : "r"(shift)     // %4
       : "cc", "memory", "q0", "q1", "q2");
 }
author	Frank Barchard <fbarchard@google.com>	2021-03-24 13:45:04 -0700
committer	Frank Barchard <fbarchard@chromium.org>	2021-03-24 21:37:10 +0000
commit	312c02a5aad4adda67cb2e0cc93a497d12845522 (patch)
tree	ce776a4db30d2319fad3bbf41fe48d6cdf1e2602 /source/row_neon.cc
parent	d8f1bfc9816a9fc76f3a25cc0ee272fb9c07622a (diff)
download	libyuv-312c02a5aad4adda67cb2e0cc93a497d12845522.tar.gz