diff options
author | Yuan Tong <tongyuan200097@gmail.com> | 2021-02-25 15:21:28 +0800 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2021-02-25 23:16:54 +0000 |
commit | a8c181050c202854ae32433164e6bd5d1e7c4368 (patch) | |
tree | c300dbf9bfa59d0dc2772c311b0dfd154e08d54a /source/row_neon.cc | |
parent | 08815a29766a78398a8e2b9ed095280e9d0a73c2 (diff) | |
download | libyuv-a8c181050c202854ae32433164e6bd5d1e7c4368.tar.gz |
Add 10/12 bit YUV To YUV functions
The following functions (and their 12 bit variant) are added:
planar, 10->10:
I410ToI010, I210ToI010
planar, 10->8:
I410ToI444, I210ToI422
planar<->biplanar, 10->10:
I010ToP010, I210ToP210, I410ToP410
P010ToI010, P210ToI210, P410ToI410
R=fbarchard@chromium.org
Change-Id: I9aa2bafa0d6a6e1e38ce4e20cbb437e10f9b0158
Bug: libyuv:834, libyuv:873
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2709822
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Diffstat (limited to 'source/row_neon.cc')
-rw-r--r-- | source/row_neon.cc | 115 |
1 files changed, 115 insertions, 0 deletions
diff --git a/source/row_neon.cc b/source/row_neon.cc index e54cb12b..43a2cac7 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -3166,6 +3166,121 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, : "cc", "memory", "q0", "q1", "q2", "q3"); } +void SplitUVRow_16_NEON(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + asm volatile( + "vdup.32 q0, %3 \n" + "1: \n" + "vld2.16 {q1, q2}, [%0]! \n" // load 8 UV + "vmovl.u16 q3, d2 \n" + "vmovl.u16 q4, d3 \n" + "vshl.u32 q3, q3, q0 \n" + "vshl.u32 q4, q4, q0 \n" + "vmovn.u32 d2, q3 \n" + "vmovn.u32 d3, q4 \n" + "vmovl.u16 q3, d4 \n" + "vmovl.u16 q4, d5 \n" + "vshl.u32 q3, q3, q0 \n" + "vshl.u32 q4, q4, q0 \n" + "vmovn.u32 d4, q3 \n" + "vmovn.u32 d5, q4 \n" + "subs %4, %4, #8 \n" // 8 src pixels per loop + "vst1.16 {q1}, [%1]! \n" // store 8 U pixels + "vst1.16 {q2}, [%2]! \n" // store 8 V pixels + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(depth), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + +void MergeUVRow_16_NEON(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width) { + int shift = 16 - depth; + asm volatile( + "vdup.16 q2, %3 \n" + "1: \n" + "vld1.16 {q0}, [%0]! \n" // load 8 U + "vld1.16 {q1}, [%1]! \n" // load 8 V + "vshl.u16 q0, q0, q2 \n" + "vshl.u16 q1, q1, q2 \n" + "subs %4, %4, #8 \n" // 8 src pixels per loop + "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(shift), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2"); +} + +void MultiplyRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + asm volatile( + "vdup.16 q2, %2 \n" + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vmul.u16 q0, q0, q2 \n" + "vmul.u16 q1, q1, q2 \n" + "vst1.16 {q0}, [%1]! \n" + "vst1.16 {q1}, [%1]! \n" + "subs %3, %3, #16 \n" // 16 src pixels per loop + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(scale), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2"); +} + +void DivideRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + asm volatile( + "vdup.16 q0, %2 \n" + "1: \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vmovl.u16 q3, d2 \n" + "vmovl.u16 q1, d3 \n" + "vmovl.u16 q4, d4 \n" + "vmovl.u16 q2, d5 \n" + "vshl.u32 q3, q3, q0 \n" + "vshl.u32 q4, q4, q0 \n" + "vshl.u32 q1, q1, q0 \n" + "vshl.u32 q2, q2, q0 \n" + "vmovn.u32 d2, q3 \n" + "vmovn.u32 d3, q1 \n" + "vmovn.u32 d4, q4 \n" + "vmovn.u32 d5, q2 \n" + "vst1.16 {q1}, [%1]! \n" + "vst1.16 {q2}, [%1]! \n" + "subs %3, %3, #16 \n" // 16 src pixels per loop + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(scale), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. #ifdef __cplusplus |