From a85cc26fde68699f95eceb4ca93c5eb70278787e Mon Sep 17 00:00:00 2001 From: Yuan Tong Date: Wed, 27 Jan 2021 08:55:58 +0800 Subject: Add MergeARGBPlane and SplitARGBPlane These functions convert between planar and interleaved ARGB, optionally fill 255 to alpha / discard alpha. This can help handle YUV(A) with Identity matrix, which is basically planar ARGB. libyuv_unittest --gtest_filter=LibYUVPlanarTest.*ARGBPlane*:LibYUVPlanarTest.*XRGBPlane* R=fbarchard@google.com Change-Id: I522a189b434f490ba1723ce51317727e7c5eb112 Bug: libyuv:877 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2649887 Commit-Queue: Frank Barchard Reviewed-by: Frank Barchard --- source/row_neon.cc | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) (limited to 'source/row_neon.cc') diff --git a/source/row_neon.cc b/source/row_neon.cc index a17899be..3e960a58 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -666,6 +666,113 @@ void MergeRGBRow_NEON(const uint8_t* src_r, ); } +// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a. +void SplitARGBRow_NEON(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB + "subs %5, %5, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%3]! \n" // store B + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%1]! \n" // store R + "vst1.8 {q3}, [%4]! \n" // store A + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(dst_a), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time +void MergeARGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q0}, [%2]! \n" // load B + "vld1.8 {q3}, [%3]! \n" // load A + "subs %5, %5, #16 \n" // 16 processed per loop + "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB + "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. +void SplitXRGBRow_NEON(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%3]! \n" // store B + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%1]! \n" // store R + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time +void MergeXRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 q3, #255 \n" // load A(255) + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q0}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB + "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( -- cgit v1.2.3