aboutsummaryrefslogtreecommitdiff
path: root/source/row_neon.cc
diff options
context:
space:
mode:
author: Yuan Tong <tongyuan200097@gmail.com> 2021-01-27 08:55:58 +0800
committer: Frank Barchard <fbarchard@chromium.org> 2021-01-27 19:33:51 +0000
commit: a85cc26fde68699f95eceb4ca93c5eb70278787e (patch)
tree: 20b7fc40b2d60c528e69fae9c9d6f70b0ae1ed67 /source/row_neon.cc
parent: f7c0a73a3ea796e9b1c2a4cf2de319eafe6d1226 (diff)
download: libyuv-a85cc26fde68699f95eceb4ca93c5eb70278787e.tar.gz
Add MergeARGBPlane and SplitARGBPlane
These functions convert between planar and interleaved ARGB, optionally fill 255 to alpha / discard alpha. This can help handle YUV(A) with Identity matrix, which is basically planar ARGB. libyuv_unittest --gtest_filter=LibYUVPlanarTest.*ARGBPlane*:LibYUVPlanarTest.*XRGBPlane* R=fbarchard@google.com Change-Id: I522a189b434f490ba1723ce51317727e7c5eb112 Bug: libyuv:877 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2649887 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source/row_neon.cc')
-rw-r--r-- source/row_neon.cc 107
1 files changed, 107 insertions, 0 deletions
diff --git a/source/row_neon.cc b/source/row_neon.cc
index a17899be..3e960a58 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -666,6 +666,113 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
);
}
+// Reads 16 packed ARGB (bytes B, G, R, A) per loop and writes them to planar dst_r, dst_g, dst_b, dst_a.
+void SplitARGBRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {  // NOTE(review): presumably a positive multiple of 16 - confirm with callers
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB: d0=B d2=G d4=R d6=A
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB fill the high halves of q0-q3
+ "subs %5, %5, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%3]! \n" // store 16 B to dst_b
+ "vst1.8 {q1}, [%2]! \n" // store 16 G to dst_g
+ "vst1.8 {q2}, [%1]! \n" // store 16 R to dst_r
+ "vst1.8 {q3}, [%4]! \n" // store 16 A to dst_a
+ "bgt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0 (was the undeclared name src_rgba)
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB (bytes B, G, R, A) at a time
+void MergeARGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {  // NOTE(review): presumably a positive multiple of 16 - confirm with callers
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load 16 R from src_r
+ "vld1.8 {q1}, [%1]! \n" // load 16 G from src_g
+ "vld1.8 {q0}, [%2]! \n" // load 16 B from src_b
+ "vld1.8 {q3}, [%3]! \n" // load 16 A from src_a
+ "subs %5, %5, #16 \n" // 16 processed per loop
+ "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB, interleaved as B, G, R, A
+ "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB from the high halves of q0-q3
+ "bgt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Reads 16 packed ARGB (bytes B, G, R, A) per loop and writes planar dst_r, dst_g, dst_b; alpha is discarded.
+void SplitXRGBRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {  // NOTE(review): presumably a positive multiple of 16 - confirm with callers
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB: d0=B d2=G d4=R d6=A
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB fill the high halves of q0-q3
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%3]! \n" // store 16 B to dst_b
+ "vst1.8 {q1}, [%2]! \n" // store 16 G to dst_g
+ "vst1.8 {q2}, [%1]! \n" // store 16 R to dst_r; q3 (A) is never stored
+ "bgt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0 (was the undeclared name src_rgba)
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List (q3 is written by the loads)
+ );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB (bytes B, G, R, A) at a time with A = 255
+void MergeXRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {  // NOTE(review): presumably a positive multiple of 16 - confirm with callers
+ asm volatile(
+ "vmov.u8 q3, #255 \n" // load A(255) once; q3 is not modified in the loop
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load 16 R from src_r
+ "vld1.8 {q1}, [%1]! \n" // load 16 G from src_g
+ "vld1.8 {q0}, [%2]! \n" // load 16 B from src_b
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB to dst_argb (%3, not the width counter %4)
+ "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB from the high halves of q0-q3
+ "bgt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(