path: root/source/row_neon.cc
author     Justin Green <greenjustin@google.com>  2023-03-14 10:23:17 -0400
committer  libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>  2023-03-14 14:59:26 +0000
commit     76468711d5c8302431a900499ff73d34fdfc146b (patch)
tree       a91e3f09e5b124c688eb809bb0b288db42eb8d46 /source/row_neon.cc
parent     f9b23b9cc0ca3bd27b9acc07ea0450cd5097175d (diff)
download   libyuv-76468711d5c8302431a900499ff73d34fdfc146b.tar.gz
MT2T Unpack fixes

Fix the algorithm for unpacking the lower 2 bits of MT2T pixels.

Bug: b:258474032
Change-Id: Iea1d63f26e3f127a70ead26bc04ea3d939e793e3
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4337978
Commit-Queue: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source/row_neon.cc')
-rw-r--r--  source/row_neon.cc | 86
1 file changed, 36 insertions(+), 50 deletions(-)
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 37f6db0c..59b3e05a 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -721,57 +721,43 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
#endif
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
- const uint8_t* src_lower_bits = src;
- const uint8_t* src_upper_bits = src + 16;
- asm volatile(
- "1: \n"
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // Load 32 bytes of upper
- // bits.
- "vld1.8 {d6}, [%0]! \n" // Load 8 bytes of lower
- // bits.
- "vshl.u8 d4, d6, #2 \n" // Align lower bits.
- "vshl.u8 d2, d6, #4 \n"
- "vshl.u8 d0, d6, #6 \n"
- "vzip.u8 d0, d1 \n" // Zip lower and upper
- // bits together.
- "vzip.u8 d2, d3 \n"
- "vzip.u8 d4, d5 \n"
- "vzip.u8 d6, d7 \n"
- "vsri.u16 q0, q0, #10 \n" // Copy upper 6 bits into
- // lower 6 bits for better
- // accuracy in
- // conversions.
- "vsri.u16 q1, q1, #10 \n"
- "vsri.u16 q2, q2, #10 \n"
- "vsri.u16 q3, q3, #10 \n"
- "vst4.16 {d0, d2, d4, d6}, [%2]! \n" // Store 32 pixels
- "vst4.16 {d1, d3, d5, d7}, [%2]! \n"
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // Process last 32 pixels
- // in the block
- "vld1.8 {d6}, [%0]! \n"
- "vshl.u8 d4, d6, #2 \n"
- "vshl.u8 d2, d6, #4 \n"
- "vshl.u8 d0, d6, #6 \n"
- "vzip.u8 d0, d1 \n"
- "vzip.u8 d2, d3 \n"
- "vzip.u8 d4, d5 \n"
- "vzip.u8 d6, d7 \n"
- "vsri.u16 q0, q0, #10 \n"
- "vsri.u16 q1, q1, #10 \n"
- "vsri.u16 q2, q2, #10 \n"
- "vsri.u16 q3, q3, #10 \n"
- "vst4.16 {d0, d2, d4, d6}, [%2]! \n"
- "vst4.16 {d1, d3, d5, d7}, [%2]! \n"
- "mov %0, %1 \n"
- "add %1, %0, #16 \n"
- "subs %3, %3, #80 \n"
- "bgt 1b \n"
- : "+r"(src_lower_bits), // %0
- "+r"(src_upper_bits), // %1
- "+r"(dst), // %2
- "+r"(size) // %3
+ asm volatile(
+ "1: \n"
+ "vld1.8 q14, [%0]! \n" // Load lower bits.
+ "vld1.8 q9, [%0]! \n" // Load upper bits row
+ // by row.
+ "vld1.8 q11, [%0]! \n"
+ "vld1.8 q13, [%0]! \n"
+ "vld1.8 q15, [%0]! \n"
+ "vshl.u8 q8, q14, #6 \n" // Shift lower bit data
+ // appropriately.
+ "vshl.u8 q10, q14, #4 \n"
+ "vshl.u8 q12, q14, #2 \n"
+ "vzip.u8 q8, q9 \n" // Interleave upper and
+ // lower bits.
+ "vzip.u8 q10, q11 \n"
+ "vzip.u8 q12, q13 \n"
+ "vzip.u8 q14, q15 \n"
+ "vsri.u16 q8, q8, #10 \n" // Copy upper 6 bits
+ // into lower 6 bits for
+ // better accuracy in
+ // conversions.
+ "vsri.u16 q9, q9, #10 \n"
+ "vsri.u16 q10, q10, #10 \n"
+ "vsri.u16 q11, q11, #10 \n"
+ "vsri.u16 q12, q12, #10 \n"
+ "vsri.u16 q13, q13, #10 \n"
+ "vsri.u16 q14, q14, #10 \n"
+ "vsri.u16 q15, q15, #10 \n"
+ "vstmia %1!, {q8-q15} \n" // Store pixel block (64
+ // pixels).
+ "subs %2, %2, #80 \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(size) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q3");
+ : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
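
For orientation, the layout the new code walks: each 80-byte MT2T block packs 64 10-bit pixels as 16 bytes of packed lower 2 bits (four pixels per byte, one bit pair per output row) followed by 64 bytes of upper 8 bits, row by row. A minimal scalar sketch of the same unpack follows, assuming size is a positive multiple of 80; the name UnpackMT2T_C_sketch is hypothetical, and this is an illustration rather than libyuv's own C path.

#include <stddef.h>
#include <stdint.h>

// Hypothetical scalar equivalent, for illustration only; libyuv's real
// scalar path may differ. Assumes size is a positive multiple of 80.
static void UnpackMT2T_C_sketch(const uint8_t* src, uint16_t* dst,
                                size_t size) {
  while (size > 0) {
    const uint8_t* lower = src;       // 16 bytes of packed lower 2 bits.
    const uint8_t* upper = src + 16;  // 64 bytes of upper 8 bits.
    for (int row = 0; row < 4; ++row) {  // Rows map to q8/q10/q12/q14.
      for (int i = 0; i < 16; ++i) {
        // Rebuild the 10-bit pixel from the upper 8 bits and bit pair
        // 'row' of lower byte i (the vshl.u8 #6/#4/#2 + vzip.u8 steps).
        uint16_t v10 = (uint16_t)((upper[row * 16 + i] << 2) |
                                  ((lower[i] >> (2 * row)) & 3));
        // Shift into the top of the 16-bit lane and replicate the upper
        // 6 bits into the low bits (the vsri.u16 #10 step).
        uint16_t v16 = (uint16_t)(v10 << 6);
        *dst++ = (uint16_t)(v16 | (v16 >> 10));
      }
    }
    src += 80;
    size -= 80;
  }
}

The replication step is what makes the scaling exact at the top of the range: a plain v10 << 6 would map the maximum 10-bit code 1023 to 65472, while 65472 | (65472 >> 10) yields 65535, so the output spans the full 16-bit range.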
// Reads 16 U's and V's and writes out 16 pairs of UV.