aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/libyuv/scale_row.h2
-rw-r--r--source/scale_argb.cc5
-rw-r--r--source/scale_neon.cc29
-rw-r--r--source/scale_neon64.cc27
4 files changed, 61 insertions, 2 deletions
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 63ea8c84..9660d1d9 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -259,6 +259,8 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index d69ecdc8..83713d55 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -61,8 +61,9 @@ static void ScaleARGBDown2(int src_width, int src_height,
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
- ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
- ScaleARGBRowDown2_NEON;
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+ ScaleARGBRowDown2Box_NEON);
}
#endif
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index bcfe6ab7..89d32756 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -698,6 +698,35 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #1 \n"
+ "vrshrn.u16 d2, q2, #1 \n"
+ "vrshrn.u16 d3, q3, #1 \n"
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index c9b6416d..413d005a 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -701,6 +701,33 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS (0)
+ // load 8 ARGB pixels.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
+ "rshrn v1.8b, v1.8h, #1 \n"
+ "rshrn v2.8b, v2.8h, #1 \n"
+ "rshrn v3.8b, v3.8h, #1 \n"
+ MEMACCESS (1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (