3 files changed, 37 insertions, 37 deletions
diff --git a/source/convert.cc b/source/convert.cc
index a41974a2..9a263536 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -24,7 +24,10 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// subsample amount uses a shift.
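+// Subsample amount uses a shift; negative values are subsampled by magnitude.
+//   v is the value to subsample (may be negative)
+//   a is the amount added before the shift, to round up
+//   s is the shift count used to subsample down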
 #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
 static __inline int Abs(int v) {
   return v >= 0 ? v : -v;
@@ -1291,7 +1294,6 @@ static int PxxxToIxxx(const uint16_t* src_y,
   if (width <= 0 || height == 0) {
     return -1;
   }
-
   ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
                        depth);
   SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 0611b9aa..416f112f 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -3911,31 +3911,29 @@ void DivideRow_16_NEON(const uint16_t* src_y,
                        int scale,
                        int width) {
   asm volatile(
-      "vdup.16     q0, %3                        \n"
-      "1:                                        \n"
-      "vld1.16     {q1}, [%0]!                   \n"
-      "vld1.16     {q2}, [%0]!                   \n"
-      "vmovl.u16   q3, d2                        \n"
-      "vmovl.u16   q1, d3                        \n"
-      "vmovl.u16   q4, d4                        \n"
-      "vmovl.u16   q2, d5                        \n"
-      "vshl.u32    q3, q3, q0                    \n"
-      "vshl.u32    q4, q4, q0                    \n"
-      "vshl.u32    q1, q1, q0                    \n"
-      "vshl.u32    q2, q2, q0                    \n"
-      "vmovn.u32   d2, q3                        \n"
-      "vmovn.u32   d3, q1                        \n"
-      "vmovn.u32   d4, q4                        \n"
-      "vmovn.u32   d5, q2                        \n"
-      "vst1.16     {q1}, [%1]!                   \n"
-      "vst1.16     {q2}, [%1]!                   \n"
+      "vdup.16     q6, %3                        \n"
+      "1:                                        \n"
+      "vld1.16     {q0, q1}, [%0]!               \n"
+      "vmovl.u16   q2, d0                        \n"
+      "vmovl.u16   q3, d1                        \n"
+      "vmovl.u16   q4, d2                        \n"
+      "vmovl.u16   q5, d3                        \n"
+      "vmul.u32    q2, q2, q6                    \n"
+      "vmul.u32    q3, q3, q6                    \n"
+      "vmul.u32    q4, q4, q6                    \n"
+      "vmul.u32    q5, q5, q6                    \n"
+      "vshrn.u32   d0, q2, #16                   \n"
+      "vshrn.u32   d1, q3, #16                   \n"
+      "vshrn.u32   d2, q4, #16                   \n"
+      "vshrn.u32   d3, q5, #16                   \n"
+      "vst1.16     {q0, q1}, [%1]!               \n"  // store 16 pixels
       "subs        %2, %2, #16                   \n"  // 16 src pixels per loop
       "bgt         1b                            \n"
       : "+r"(src_y),  // %0
         "+r"(dst_y),  // %1
         "+r"(width)   // %2
       : "r"(scale)    // %3
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
 }
 
 // Use scale to convert lsb formats to msb, depending how many bits there are:
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 41289fe9..088174c8 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -4461,30 +4461,30 @@ void DivideRow_16_NEON(const uint16_t* src_y,
                        int scale,
                        int width) {
   asm volatile(
-      "dup         v0.8h, %w3                    \n"
+      "dup         v6.8h, %w3                    \n"
       "1:                                        \n"
-      "ldp         q1, q2, [%0], #32             \n"
-      "ushll       v3.4s, v1.4h, #0              \n"
-      "ushll       v4.4s, v2.4h, #0              \n"
+      "ldp         q0, q1, [%0], #32             \n"
+      "ushll       v2.4s, v0.4h, #0              \n"
+      "ushll2      v3.4s, v0.8h, #0              \n"
+      "ushll       v4.4s, v1.4h, #0              \n"
+      "ushll2      v5.4s, v1.8h, #0              \n"
       "prfm        pldl1keep, [%0, 448]          \n"
-      "ushll2      v1.4s, v1.8h, #0              \n"
-      "ushll2      v2.4s, v2.8h, #0              \n"
-      "mul         v3.4s, v0.4s, v3.4s           \n"
-      "mul         v4.4s, v0.4s, v4.4s           \n"
-      "mul         v1.4s, v0.4s, v1.4s           \n"
-      "mul         v2.4s, v0.4s, v2.4s           \n"
-      "shrn        v3.4h, v3.4s, #16             \n"
-      "shrn        v4.4h, v4.4s, #16             \n"
-      "shrn2       v3.8h, v1.4s, #16             \n"
-      "shrn2       v4.8h, v2.4s, #16             \n"
-      "stp         q3, q3, [%1], #32             \n"  // store 16 pixels
+      "mul         v2.4s, v2.4s, v6.4s           \n"
+      "mul         v3.4s, v3.4s, v6.4s           \n"
+      "mul         v4.4s, v4.4s, v6.4s           \n"
+      "mul         v5.4s, v5.4s, v6.4s           \n"
+      "shrn        v0.4h, v2.4s, #16             \n"
+      "shrn2       v0.8h, v3.4s, #16             \n"
+      "shrn        v1.4h, v4.4s, #16             \n"
+      "shrn2       v1.8h, v5.4s, #16             \n"
+      "stp         q0, q1, [%1], #32             \n"  // store 16 pixels
       "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
       "b.gt        1b                            \n"
       : "+r"(src_y),  // %0
         "+r"(dst_y),  // %1
         "+r"(width)   // %2
       : "r"(scale)    // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
 }
 
 // Use scale to convert lsb formats to msb, depending how many bits there are: