aboutsummaryrefslogtreecommitdiff
path: root/source/row_neon.cc
diff options
context:
space:
mode:
authorJustin Green <greenjustin@google.com>2022-02-03 11:46:44 -0500
committerlibyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>2022-02-03 17:01:49 +0000
commitb4ddbaf549a1bf5572bf703fd2862d1eb7380c6a (patch)
tree9e0a90646de7b1c50f40e3aeb452f749d297561b /source/row_neon.cc
parent804980bbab748fd0e180cd6e7d9292ff49baf704 (diff)
downloadlibyuv-b4ddbaf549a1bf5572bf703fd2862d1eb7380c6a.tar.gz
Add support for MM21.
Add support for MM21 to NV12 and I420 conversion, and add SIMD optimizations for arm, aarch64, SSE2, and SSSE3 machines. Bug: libyuv:915, b/215425056 Change-Id: Iecb0c33287f35766a6169d4adf3b7397f1ba8b5d Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3433269 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Commit-Queue: Justin Green <greenjustin@google.com>
Diffstat (limited to 'source/row_neon.cc')
-rw-r--r--source/row_neon.cc46
1 files changed, 46 insertions, 0 deletions
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 543ebec9..4781e2f6 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -575,6 +575,52 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
);
}
+// Reads 16 byte Y's from tile and writes out 16 Y's.
+// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes
+// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes
+// width measured in bytes so 8 UV = 16.
+void DetileRow_NEON(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.16 {q0}, [%0], %3 \n" // load 16 bytes
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "pld [%0, 1792] \n"
+ "vst1.16 {q0}, [%1]! \n" // store 16 bytes
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "q0" // Clobber List
+ );
+}
+
+// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
+void DetileSplitUVRow_NEON(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], %4 \n"
+ "subs %3, %3, #16 \n"
+ "pld [%0, 1792] \n"
+ "vst1.8 {d0}, [%1]! \n"
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(src_tile_stride) // %4
+ : "cc", "memory", "d0", "d1" // Clobber List
+ );
+}
+
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,