aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- source/convert.cc 6
-rw-r--r-- source/row_neon.cc 36
-rw-r--r-- source/row_neon64.cc 32
3 files changed, 37 insertions, 37 deletions
diff --git a/source/convert.cc b/source/convert.cc
index a41974a2..9a263536 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -24,7 +24,10 @@ namespace libyuv {
extern "C" {
#endif
-// subsample amount uses a shift.
+// Subsample amount uses a shift.
+// v is value
+// a is amount to add to round up
+// s is shift to subsample down
// Rounds the magnitude of v: adds a, shifts right by s, then restores the sign.
// Every argument and the whole expansion are parenthesized so the macro stays
// correct when passed compound expressions or used inside a larger expression.
#define SUBSAMPLE(v, a, s) \
  (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))
static __inline int Abs(int v) {
return v >= 0 ? v : -v;
@@ -1291,7 +1294,6 @@ static int PxxxToIxxx(const uint16_t* src_y,
if (width <= 0 || height == 0) {
return -1;
}
-
ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
depth);
SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 0611b9aa..416f112f 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -3911,31 +3911,29 @@ void DivideRow_16_NEON(const uint16_t* src_y,
int scale,
int width) {
asm volatile(
- "vdup.16 q0, %3 \n"
- "1: \n"
- "vld1.16 {q1}, [%0]! \n"
- "vld1.16 {q2}, [%0]! \n"
- "vmovl.u16 q3, d2 \n"
- "vmovl.u16 q1, d3 \n"
- "vmovl.u16 q4, d4 \n"
- "vmovl.u16 q2, d5 \n"
- "vshl.u32 q3, q3, q0 \n"
- "vshl.u32 q4, q4, q0 \n"
- "vshl.u32 q1, q1, q0 \n"
- "vshl.u32 q2, q2, q0 \n"
- "vmovn.u32 d2, q3 \n"
- "vmovn.u32 d3, q1 \n"
- "vmovn.u32 d4, q4 \n"
- "vmovn.u32 d5, q2 \n"
- "vst1.16 {q1}, [%1]! \n"
- "vst1.16 {q2}, [%1]! \n"
+ "vdup.16 q6, %3 \n"
+ "1: \n"
+ "vld1.16 {q0, q1}, [%0]! \n"
+ "vmovl.u16 q2, d0 \n"
+ "vmovl.u16 q3, d1 \n"
+ "vmovl.u16 q4, d2 \n"
+ "vmovl.u16 q5, d3 \n"
+ "vmul.u32 q2, q2, q6 \n"
+ "vmul.u32 q3, q3, q6 \n"
+ "vmul.u32 q4, q4, q6 \n"
+ "vmul.u32 q5, q5, q6 \n"
+ "vshrn.u32 d0, q2, #16 \n"
+ "vshrn.u32 d1, q3, #16 \n"
+ "vshrn.u32 d2, q4, #16 \n"
+ "vshrn.u32 d3, q5, #16 \n"
+ "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
"subs %2, %2, #16 \n" // 16 src pixels per loop
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
}
// Use scale to convert lsb formats to msb, depending how many bits there are:
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 41289fe9..088174c8 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -4461,30 +4461,30 @@ void DivideRow_16_NEON(const uint16_t* src_y,
int scale,
int width) {
asm volatile(
- "dup v0.8h, %w3 \n"
+ "dup v6.8h, %w3 \n"
"1: \n"
- "ldp q1, q2, [%0], #32 \n"
- "ushll v3.4s, v1.4h, #0 \n"
- "ushll v4.4s, v2.4h, #0 \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "ushll v2.4s, v0.4h, #0 \n"
+ "ushll2 v3.4s, v0.8h, #0 \n"
+ "ushll v4.4s, v1.4h, #0 \n"
+ "ushll2 v5.4s, v1.8h, #0 \n"
"prfm pldl1keep, [%0, 448] \n"
- "ushll2 v1.4s, v1.8h, #0 \n"
- "ushll2 v2.4s, v2.8h, #0 \n"
- "mul v3.4s, v0.4s, v3.4s \n"
- "mul v4.4s, v0.4s, v4.4s \n"
- "mul v1.4s, v0.4s, v1.4s \n"
- "mul v2.4s, v0.4s, v2.4s \n"
- "shrn v3.4h, v3.4s, #16 \n"
- "shrn v4.4h, v4.4s, #16 \n"
- "shrn2 v3.8h, v1.4s, #16 \n"
- "shrn2 v4.8h, v2.4s, #16 \n"
- "stp q3, q3, [%1], #32 \n" // store 16 pixels
+ "mul v2.4s, v2.4s, v6.4s \n"
+ "mul v3.4s, v3.4s, v6.4s \n"
+ "mul v4.4s, v4.4s, v6.4s \n"
+ "mul v5.4s, v5.4s, v6.4s \n"
+ "shrn v0.4h, v2.4s, #16 \n"
+ "shrn2 v0.8h, v3.4s, #16 \n"
+ "shrn v1.4h, v4.4s, #16 \n"
+ "shrn2 v1.8h, v5.4s, #16 \n"
+ "stp q0, q1, [%1], #32 \n" // store 16 pixels
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
// Use scale to convert lsb formats to msb, depending how many bits there are: