diff options
author | Yuan Tong <tongyuan200097@gmail.com> | 2021-01-27 08:55:58 +0800 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2021-01-27 19:33:51 +0000 |
commit | a85cc26fde68699f95eceb4ca93c5eb70278787e (patch) | |
tree | 20b7fc40b2d60c528e69fae9c9d6f70b0ae1ed67 /source/row_neon.cc | |
parent | f7c0a73a3ea796e9b1c2a4cf2de319eafe6d1226 (diff) | |
download | libyuv-a85cc26fde68699f95eceb4ca93c5eb70278787e.tar.gz |
Add MergeARGBPlane and SplitARGBPlane
These functions convert between planar and interleaved ARGB,
optionally fill 255 to alpha / discard alpha.
This can help handle YUV(A) with Identity matrix, which is
basically planar ARGB.
libyuv_unittest --gtest_filter=LibYUVPlanarTest.*ARGBPlane*:LibYUVPlanarTest.*XRGBPlane*
R=fbarchard@google.com
Change-Id: I522a189b434f490ba1723ce51317727e7c5eb112
Bug: libyuv:877
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2649887
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source/row_neon.cc')
-rw-r--r-- | source/row_neon.cc | 107 |
1 files changed, 107 insertions, 0 deletions
diff --git a/source/row_neon.cc b/source/row_neon.cc index a17899be..3e960a58 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -666,6 +666,113 @@ void MergeRGBRow_NEON(const uint8_t* src_r, ); } +// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a. +void SplitARGBRow_NEON(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB + "subs %5, %5, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%3]! \n" // store B + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%1]! \n" // store R + "vst1.8 {q3}, [%4]! \n" // store A + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(dst_a), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time +void MergeARGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q0}, [%2]! \n" // load B + "vld1.8 {q3}, [%3]! \n" // load A + "subs %5, %5, #16 \n" // 16 processed per loop + "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB + "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. +void SplitXRGBRow_NEON(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%3]! \n" // store B + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%1]! \n" // store R + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time +void MergeXRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 q3, #255 \n" // load A(255) + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q0}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB + "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( |