diff options
-rw-r--r-- | source/convert.cc | 6 | ||||
-rw-r--r-- | source/row_neon.cc | 36 | ||||
-rw-r--r-- | source/row_neon64.cc | 32 |
3 files changed, 37 insertions, 37 deletions
diff --git a/source/convert.cc b/source/convert.cc index a41974a2..9a263536 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -24,7 +24,10 @@ namespace libyuv { extern "C" { #endif -// subsample amount uses a shift. +// Subsample amount uses a shift. +// v is value +// a is amount to add to round up +// s is shift to subsample down #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) static __inline int Abs(int v) { return v >= 0 ? v : -v; @@ -1291,7 +1294,6 @@ static int PxxxToIxxx(const uint16_t* src_y, if (width <= 0 || height == 0) { return -1; } - ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height, depth); SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, diff --git a/source/row_neon.cc b/source/row_neon.cc index 0611b9aa..416f112f 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -3911,31 +3911,29 @@ void DivideRow_16_NEON(const uint16_t* src_y, int scale, int width) { asm volatile( - "vdup.16 q0, %3 \n" - "1: \n" - "vld1.16 {q1}, [%0]! \n" - "vld1.16 {q2}, [%0]! \n" - "vmovl.u16 q3, d2 \n" - "vmovl.u16 q1, d3 \n" - "vmovl.u16 q4, d4 \n" - "vmovl.u16 q2, d5 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vshl.u32 q1, q1, q0 \n" - "vshl.u32 q2, q2, q0 \n" - "vmovn.u32 d2, q3 \n" - "vmovn.u32 d3, q1 \n" - "vmovn.u32 d4, q4 \n" - "vmovn.u32 d5, q2 \n" - "vst1.16 {q1}, [%1]! \n" - "vst1.16 {q2}, [%1]! \n" + "vdup.16 q6, %3 \n" + "1: \n" + "vld1.16 {q0, q1}, [%0]! \n" + "vmovl.u16 q2, d0 \n" + "vmovl.u16 q3, d1 \n" + "vmovl.u16 q4, d2 \n" + "vmovl.u16 q5, d3 \n" + "vmul.u32 q2, q2, q6 \n" + "vmul.u32 q3, q3, q6 \n" + "vmul.u32 q4, q4, q6 \n" + "vmul.u32 q5, q5, q6 \n" + "vshrn.u32 d0, q2, #16 \n" + "vshrn.u32 d1, q3, #16 \n" + "vshrn.u32 d2, q4, #16 \n" + "vshrn.u32 d3, q5, #16 \n" + "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels "subs %2, %2, #16 \n" // 16 src pixels per loop "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "r"(scale) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); } // Use scale to convert lsb formats to msb, depending how many bits there are: diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 41289fe9..088174c8 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -4461,30 +4461,30 @@ void DivideRow_16_NEON(const uint16_t* src_y, int scale, int width) { asm volatile( - "dup v0.8h, %w3 \n" + "dup v6.8h, %w3 \n" "1: \n" - "ldp q1, q2, [%0], #32 \n" - "ushll v3.4s, v1.4h, #0 \n" - "ushll v4.4s, v2.4h, #0 \n" + "ldp q0, q1, [%0], #32 \n" + "ushll v2.4s, v0.4h, #0 \n" + "ushll2 v3.4s, v0.8h, #0 \n" + "ushll v4.4s, v1.4h, #0 \n" + "ushll2 v5.4s, v1.8h, #0 \n" "prfm pldl1keep, [%0, 448] \n" - "ushll2 v1.4s, v1.8h, #0 \n" - "ushll2 v2.4s, v2.8h, #0 \n" - "mul v3.4s, v0.4s, v3.4s \n" - "mul v4.4s, v0.4s, v4.4s \n" - "mul v1.4s, v0.4s, v1.4s \n" - "mul v2.4s, v0.4s, v2.4s \n" - "shrn v3.4h, v3.4s, #16 \n" - "shrn v4.4h, v4.4s, #16 \n" - "shrn2 v3.8h, v1.4s, #16 \n" - "shrn2 v4.8h, v2.4s, #16 \n" - "stp q3, q3, [%1], #32 \n" // store 16 pixels + "mul v2.4s, v2.4s, v6.4s \n" + "mul v3.4s, v3.4s, v6.4s \n" + "mul v4.4s, v4.4s, v6.4s \n" + "mul v5.4s, v5.4s, v6.4s \n" + "shrn v0.4h, v2.4s, #16 \n" + "shrn2 v0.8h, v3.4s, #16 \n" + "shrn v1.4h, v4.4s, #16 \n" + "shrn2 v1.8h, v5.4s, #16 \n" + "stp q0, q1, [%1], #32 \n" // store 16 pixels "subs %w2, %w2, #16 \n" // 16 src pixels per loop "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "r"(scale) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // Use scale to convert lsb formats to msb, depending how many bits there are: |