diff options
author | Frank Barchard <fbarchard@google.com> | 2019-04-12 10:20:44 -0700 |
---|---|---|
committer | Commit Bot <commit-bot@chromium.org> | 2019-04-12 17:48:45 +0000 |
commit | 413a8d8041f1cc5a350a47c0d81cc721e64f9fd0 (patch) | |
tree | 9e7f16a0109fd129a1eb58b4967c5a6e8ff3a8a5 /source/row_neon64.cc | |
parent | 4bd08cbc0e45fb434a1d1245004c7b922ed91beb (diff) | |
download | libyuv-413a8d8041f1cc5a350a47c0d81cc721e64f9fd0.tar.gz |
Add AYUVToNV12 and NV21ToNV12
BUG=libyuv:832
TESTED=out/Release/libyuv_unittest --gtest_filter=*ToNV12* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1
R=rrwinterton@gmail.com
Change-Id: Id03b4613211fb6a6e163d10daa7c692fe31e36d8
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1560080
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r-- | source/row_neon64.cc | 199 |
1 file changed, 122 insertions, 77 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 5d045f64..449c9f39 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -608,7 +608,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } // Reads 16 U's and V's and writes out 16 pairs of UV. @@ -629,7 +629,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. @@ -653,7 +653,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, "+r"(width) // %4 : // Input registers : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); + ); } // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time @@ -677,7 +677,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, "+r"(width) // %4 : // Input registers : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); + ); } // Copy multiple of 32. @@ -693,7 +693,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { "+r"(width) // %2 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } // SetRow writes 'width' bytes using an 8 bit value repeated. 
@@ -800,7 +800,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { @@ -818,7 +818,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); + ); } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { @@ -835,7 +835,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } #define RGB565TOARGB \ @@ -867,7 +867,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List - ); + ); } #define ARGB1555TOARGB \ @@ -924,7 +924,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } #define ARGB4444TOARGB \ @@ -955,7 +955,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } void ARGBToRGB24Row_NEON(const uint8_t* src_argb, @@ -973,7 +973,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { @@ -990,7 +990,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); + ); } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { @@ -1005,7 +1005,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", 
"v1" // Clobber List - ); + ); } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { @@ -1020,7 +1020,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, @@ -1040,7 +1040,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, @@ -1060,7 +1060,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, @@ -1087,7 +1087,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List - ); + ); } void UYVYToUVRow_NEON(const uint8_t* src_uyvy, @@ -1114,7 +1114,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List - ); + ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
@@ -1135,7 +1135,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : "r"(shuffler) // %3 : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); + ); } void I422ToYUY2Row_NEON(const uint8_t* src_y, @@ -1298,7 +1298,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1863,7 +1863,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" - ); + ); } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { @@ -2611,7 +2611,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, : "r"(2LL), // %5 "r"(6LL) // %6 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } // SobelY as a matrix is @@ -2648,7 +2648,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, : "r"(1LL), // %4 "r"(6LL) // %5 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } // Caveat - rounds float to half float whereas scaling version truncates. 
@@ -2879,23 +2879,51 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { // Convert biplanar NV21 to packed YUV24 void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - asm volatile ( - "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values - "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values - "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values - "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values - "subs %w3, %w3, #16 \n" // 16 pixels per loop - "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2"); + uint8_t* dst_yuv24, + int width) { + asm volatile( + "1: \n" + "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values + "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values + "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2"); +} + +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; + asm volatile( + + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels. + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v2.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. 
+ "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } void AYUVToVURow_NEON(const uint8_t* src_ayuv, @@ -2905,40 +2933,41 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_ayuv_1), // %1 - "+r"(dst_vu), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels. + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. 
+ "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Copy row of AYUV Y's into Y void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile ( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } void FloatDivToByteRow_NEON(const float* src_weights, @@ -2962,7 +2991,7 @@ void FloatDivToByteRow_NEON(const float* src_weights, "uqxtn v1.4h, v1.4s \n" // 8 shorts "uqxtn2 v1.8h, v2.4s \n" "uqxtn v1.8b, v1.8h \n" // 8 bytes - + "st1 {v1.8b}, [%2], #8 \n" // store 8 byte out "fcmgt v5.4s, v1.4s, v0.4s \n" // cmp weight to zero @@ -2974,15 +3003,31 @@ void FloatDivToByteRow_NEON(const float* src_weights, "st1 {v5.8b}, [%3], #8 \n" // store 8 byte mask "b.gt 1b \n" - : "+r"(src_weights), // %0 - "+r"(src_values), // %1 - "+r"(dst_out), // %2 - "+r"(dst_mask), // %3 - "+r"(width) // %4 + : "+r"(src_weights), // %0 + "+r"(src_values), // %1 + "+r"(dst_out), // %2 + "+r"(dst_mask), // %3 + "+r"(width) // %4 : : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); } +// Convert biplanar UV channel of NV12 to NV21 +void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values + "orr v2.16b, v0.16b, v0.16b \n" // move U after V + "subs %w2, 
%w2, #16 \n" // 16 pixels per loop + "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus |