diff options
author | Yuan Tong <tongyuan200097@gmail.com> | 2021-03-20 23:22:08 +0800 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2021-03-31 20:46:02 +0000 |
commit | 8a13626e42f7fdcf3a6acbb0316760ee54cda7d8 (patch) | |
tree | ab34de049749adec1366344b02be882797ace6a4 /source/row_neon.cc | |
parent | 2525698acba9bf9b701ba6b4d9584291a1f62257 (diff) | |
download | libyuv-8a13626e42f7fdcf3a6acbb0316760ee54cda7d8.tar.gz |
Add MergeAR30Plane, MergeAR64Plane, MergeARGB16To8Plane
These functions merge high bit depth planar RGB pixels into packed format.
Change-Id: I506935a164b069e6b2fed8bf152cb874310c0916
Bug: libyuv:886, libyuv:889
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2780468
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source/row_neon.cc')
-rw-r--r-- | source/row_neon.cc | 264 |
1 file changed, 242 insertions(+), 22 deletions(-)
diff --git a/source/row_neon.cc b/source/row_neon.cc index 2165d0d0..ce0759f5 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -415,11 +415,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" + "vmov.u8 d23, #255 \n" + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_argb), // %2 @@ -438,11 +438,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" + "vmov.u8 d23, #255 \n" + "1: \n" READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_argb), // %2 @@ -537,11 +537,11 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READYUY2 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "vmov.u8 d23, #255 \n" + "1: \n" READYUY2 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -558,11 +558,11 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READUYVY YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "vmov.u8 d23, #255 \n" + "1: \n" READUYVY YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -760,8 +760,8 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, "vld1.8 {q1}, [%1]! \n" // load G "vld1.8 {q0}, [%2]! \n" // load B "subs %4, %4, #16 \n" // 16 processed per loop - "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB - "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB + "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB + "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB "bgt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 @@ -773,6 +773,226 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, ); } +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = 10 - depth; + asm volatile( + "vmov.u32 q14, #1023 \n" + "vdup.32 q15, %5 \n" + "1: \n" + "vld1.16 {d4}, [%2]! \n" // B + "vld1.16 {d2}, [%1]! \n" // G + "vld1.16 {d0}, [%0]! \n" // R + "vmovl.u16 q2, d4 \n" // B + "vmovl.u16 q1, d2 \n" // G + "vmovl.u16 q0, d0 \n" // R + "vshl.u32 q2, q2, q15 \n" // 000B + "vshl.u32 q1, q1, q15 \n" + "vshl.u32 q0, q0, q15 \n" + "vmin.u32 q2, q2, q14 \n" + "vmin.u32 q1, q1, q14 \n" + "vmin.u32 q0, q0, q14 \n" + "vsli.u32 q2, q1, #10 \n" // 00GB + "vsli.u32 q2, q0, #20 \n" // 0RGB + "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) + "subs %4, %4, #4 \n" + "vst1.8 {q2}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "q0", "q1", "q2", "q14", "q15"); +} + +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width) { + asm volatile( + "vmov.u32 q14, #1023 \n" + "1: \n" + "vld1.16 {d4}, [%2]! \n" // B + "vld1.16 {d2}, [%1]! \n" // G + "vld1.16 {d0}, [%0]! 
\n" // R + "vmovl.u16 q2, d4 \n" // 000B + "vmovl.u16 q1, d2 \n" // G + "vmovl.u16 q0, d0 \n" // R + "vmin.u32 q2, q2, q14 \n" + "vmin.u32 q1, q1, q14 \n" + "vmin.u32 q0, q0, q14 \n" + "vsli.u32 q2, q1, #10 \n" // 00GB + "vsli.u32 q2, q0, #20 \n" // 0RGB + "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) + "subs %4, %4, #4 \n" + "vst1.8 {q2}, [%3]! \n" + "bgt 1b \n" + "3: \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "q0", "q1", "q2", "q14"); +} + +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "vdup.u16 q15, %6 \n" + "vdup.u16 q14, %7 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vld1.16 {q3}, [%3]! \n" // A + "vmin.u16 q2, q2, q14 \n" + "vmin.u16 q1, q1, q14 \n" + "vmin.u16 q0, q0, q14 \n" + "vmin.u16 q3, q3, q14 \n" + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vshl.u16 q3, q3, q15 \n" + "subs %5, %5, #8 \n" + "vst4.16 {d0, d2, d4, d6}, [%4]! \n" + "vst4.16 {d1, d3, d5, d7}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 + "+r"(width) // %5 + : "r"(shift), // %6 + "r"(mask) // %7 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeXR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "vmov.u8 q3, #0xff \n" // A (0xffff) + "vdup.u16 q15, %5 \n" + "vdup.u16 q14, %6 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! 
\n" // B + "vmin.u16 q2, q2, q14 \n" + "vmin.u16 q1, q1, q14 \n" + "vmin.u16 q0, q0, q14 \n" + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "subs %4, %4, #8 \n" + "vst4.16 {d0, d2, d4, d6}, [%3]! \n" + "vst4.16 {d1, d3, d5, d7}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "r"(shift), // %5 + "r"(mask) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "vdup.16 q15, %6 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vld1.16 {q3}, [%3]! \n" // A + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vshl.u16 q3, q3, q15 \n" + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d1, q1 \n" + "vqmovn.u16 d2, q2 \n" + "vqmovn.u16 d3, q3 \n" + "subs %5, %5, #8 \n" + "vst4.8 {d0, d1, d2, d3}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : "r"(shift) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "vdup.16 q15, %5 \n" + "vmov.u8 d6, #0xff \n" // A (0xff) + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vqmovn.u16 d5, q2 \n" + "vqmovn.u16 d4, q1 \n" + "vqmovn.u16 d3, q0 \n" + "subs %4, %4, #8 \n" + "vst4.u8 {d3, d4, d5, d6}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "q0", "q1", "q2", "d6", "q15"); +} + // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( |