diff options
-rw-r--r-- | include/libyuv/scale_row.h | 2 | ||||
-rw-r--r-- | source/scale_argb.cc | 5 | ||||
-rw-r--r-- | source/scale_neon.cc | 29 | ||||
-rw-r--r-- | source/scale_neon64.cc | 27 |
4 files changed, 61 insertions, 2 deletions
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 63ea8c84..9660d1d9 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -259,6 +259,8 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width); void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); diff --git a/source/scale_argb.cc b/source/scale_argb.cc index d69ecdc8..83713d55 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -61,8 +61,9 @@ static void ScaleARGBDown2(int src_width, int src_height, #endif #if defined(HAS_SCALEARGBROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON : - ScaleARGBRowDown2_NEON; + ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON : + ScaleARGBRowDown2Box_NEON); } #endif diff --git a/source/scale_neon.cc b/source/scale_neon.cc index bcfe6ab7..89d32756 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -698,6 +698,35 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #1 \n" + "vrshrn.u16 d2, q2, #1 \n" + "vrshrn.u16 d3, q3, #1 \n" + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index c9b6416d..413d005a 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -701,6 +701,33 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS (0) + // load 8 ARGB pixels. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack + "rshrn v1.8b, v1.8h, #1 \n" + "rshrn v2.8b, v2.8h, #1 \n" + "rshrn v3.8b, v3.8h, #1 \n" + MEMACCESS (1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( |