Add AYUVToNV12 and NV21ToNV12

BUG=libyuv:832
TESTED=out/Release/libyuv_unittest --gtest_filter=*ToNV12* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1
R=rrwinterton@gmail.com

Change-Id: Id03b4613211fb6a6e163d10daa7c692fe31e36d8
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1560080
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
author: Frank Barchard <fbarchard@google.com> 2019-04-12 10:20:44 -0700
committer: Commit Bot <commit-bot@chromium.org> 2019-04-12 17:48:45 +0000
commit: 413a8d8041f1cc5a350a47c0d81cc721e64f9fd0 (patch)
tree: 9e7f16a0109fd129a1eb58b4967c5a6e8ff3a8a5 /source/row_neon64.cc
parent: 4bd08cbc0e45fb434a1d1245004c7b922ed91beb (diff)
download: libyuv-413a8d8041f1cc5a350a47c0d81cc721e64f9fd0.tar.gz
1 files changed, 122 insertions, 77 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 5d045f64..449c9f39 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -608,7 +608,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
         "+r"(width)                 // %3  // Output registers
       :                             // Input registers
       : "cc", "memory", "v0", "v1"  // Clobber List
-      );
+  );
 }
 
 // Reads 16 U's and V's and writes out 16 pairs of UV.
@@ -629,7 +629,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
         "+r"(width)                 // %3  // Output registers
       :                             // Input registers
       : "cc", "memory", "v0", "v1"  // Clobber List
-      );
+  );
 }
 
 // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
@@ -653,7 +653,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
         "+r"(width)                       // %4
       :                                   // Input registers
       : "cc", "memory", "v0", "v1", "v2"  // Clobber List
-      );
+  );
 }
 
 // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
@@ -677,7 +677,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
         "+r"(width)                       // %4
       :                                   // Input registers
       : "cc", "memory", "v0", "v1", "v2"  // Clobber List
-      );
+  );
 }
 
 // Copy multiple of 32.
@@ -693,7 +693,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
         "+r"(width)                 // %2  // Output registers
       :                             // Input registers
       : "cc", "memory", "v0", "v1"  // Clobber List
-      );
+  );
 }
 
 // SetRow writes 'width' bytes using an 8 bit value repeated.
@@ -800,7 +800,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
         "+r"(width)       // %2
       :
       : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
-      );
+  );
 }
 
 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
@@ -818,7 +818,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
         "+r"(width)      // %2
       :
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
-      );
+  );
 }
 
 void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
@@ -835,7 +835,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
         "+r"(width)       // %2
       :
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
-      );
+  );
 }
 
 #define RGB565TOARGB                                                        \
@@ -867,7 +867,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
         "+r"(width)        // %2
       :
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
-      );
+  );
 }
 
 #define ARGB1555TOARGB                                                      \
@@ -924,7 +924,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
         "+r"(width)          // %2
       :
       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-      );
+  );
 }
 
 #define ARGB4444TOARGB                                                      \
@@ -955,7 +955,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
         "+r"(width)          // %2
       :
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
-      );
+  );
 }
 
 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
@@ -973,7 +973,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
         "+r"(width)       // %2
       :
       : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
-      );
+  );
 }
 
 void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
@@ -990,7 +990,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
         "+r"(width)      // %2
       :
       : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
-      );
+  );
 }
 
 void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
@@ -1005,7 +1005,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
         "+r"(width)      // %2
       :
       : "cc", "memory", "v0", "v1"  // Clobber List
-      );
+  );
 }
 
 void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
@@ -1020,7 +1020,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
         "+r"(width)      // %2
       :
       : "cc", "memory", "v0", "v1"  // Clobber List
-      );
+  );
 }
 
 void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
@@ -1040,7 +1040,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
         "+r"(width)      // %3
       :
       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-      );
+  );
 }
 
 void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
@@ -1060,7 +1060,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
         "+r"(width)      // %3
       :
       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-      );
+  );
 }
 
 void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
@@ -1087,7 +1087,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
       :
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
         "v7"  // Clobber List
-      );
+  );
 }
 
 void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
@@ -1114,7 +1114,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
       :
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
         "v7"  // Clobber List
-      );
+  );
 }
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
@@ -1135,7 +1135,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
         "+r"(width)                       // %2
       : "r"(shuffler)                     // %3
       : "cc", "memory", "v0", "v1", "v2"  // Clobber List
-      );
+  );
 }
 
 void I422ToYUY2Row_NEON(const uint8_t* src_y,
@@ -1298,7 +1298,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
         "+r"(width)      // %2
       :
       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-      );
+  );
 }
 
 void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -1863,7 +1863,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
         "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
         "v28"
 
-      );
+  );
 }
 
 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@@ -2611,7 +2611,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
       : "r"(2LL),                               // %5
         "r"(6LL)                                // %6
       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-      );
+  );
 }
 
 // SobelY as a matrix is
@@ -2648,7 +2648,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
       : "r"(1LL),                               // %4
         "r"(6LL)                                // %5
       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-      );
+  );
 }
 
 // Caveat - rounds float to half float whereas scaling version truncates.
@@ -2879,23 +2879,51 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
 // Convert biplanar NV21 to packed YUV24
 void NV21ToYUV24Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_vu,
-                          uint8_t* dst_yuv24,
-                          int width) {
-  asm volatile (
-  "1:                                          \n"
-    "ld1        {v2.16b}, [%0], #16            \n"  // load 16 Y values
-    "ld2        {v0.8b, v1.8b}, [%1], #16      \n"  // load 8 VU values
-    "zip1       v0.16b, v0.16b, v0.16b         \n"  // replicate V values
-    "zip1       v1.16b, v1.16b, v1.16b         \n"  // replicate U values
-    "subs       %w3, %w3, #16                  \n"  // 16 pixels per loop
-    "st3        {v0.16b,v1.16b,v2.16b}, [%2], #48 \n"  // store 16 YUV pixels
-    "b.gt       1b                             \n"
-    : "+r"(src_y),      // %0
-      "+r"(src_vu),     // %1
-      "+r"(dst_yuv24),  // %2
-      "+r"(width)       // %3
-    :
-    : "cc", "memory", "v0", "v1", "v2");
+                         uint8_t* dst_yuv24,
+                         int width) {
+  asm volatile(
+      "1:                                          \n"
+      "ld1        {v2.16b}, [%0], #16            \n"     // load 16 Y values into v2
+      "ld2        {v0.8b, v1.8b}, [%1], #16      \n"     // load 8 VU pairs: v0 = V, v1 = U
+      "zip1       v0.16b, v0.16b, v0.16b         \n"     // duplicate each V byte (1 V per 2 pixels)
+      "zip1       v1.16b, v1.16b, v1.16b         \n"     // duplicate each U byte (1 U per 2 pixels)
+      "subs       %w3, %w3, #16                  \n"     // 16 pixels per loop
+      "st3        {v0.16b,v1.16b,v2.16b}, [%2], #48 \n"  // store 16 pixels, bytes interleaved as V,U,Y
+      "b.gt       1b                             \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_vu),     // %1
+        "+r"(dst_yuv24),  // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2");
+}
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_uv,
+                      int width) {
+  const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;  // second source row, one stride past src_ayuv
+  asm volatile(
+
+      "1:                                          \n"
+      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
+                                                                // pixels (row 0).
+      "uaddlp     v0.8h, v0.16b                  \n"  // V: add adjacent pairs, 16 bytes -> 8 shorts.
+      "uaddlp     v1.8h, v1.16b                  \n"  // U: add adjacent pairs, 16 bytes -> 8 shorts.
+      "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 pixels (row 1)
+      "uadalp     v0.8h, v4.16b                  \n"  // V: accumulate row 1 pairs into the 8 shorts.
+      "uadalp     v1.8h, v5.16b                  \n"  // U: accumulate row 1 pairs into the 8 shorts.
+      "uqrshrn    v3.8b, v0.8h, #2               \n"  // V: 2x2 average (rounded sum >> 2) into v3
+      "uqrshrn    v2.8b, v1.8h, #2               \n"  // U: 2x2 average into v2 so U precedes V in st2
+      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+      "st2        {v2.8b,v3.8b}, [%2], #16       \n"  // store 8 interleaved UV pairs (U first).
+      "b.gt       1b                             \n"
+      : "+r"(src_ayuv),    // %0
+        "+r"(src_ayuv_1),  // %1
+        "+r"(dst_uv),      // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 void AYUVToVURow_NEON(const uint8_t* src_ayuv,
@@ -2905,40 +2933,41 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
   const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
   asm volatile(
 
-  "1:                                          \n"
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v0.16b                  \n"  // V 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // U 16 bytes -> 8 shorts.
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
-    "uadalp     v0.8h, v4.16b                  \n"  // V 16 bytes -> 8 shorts.
-    "uadalp     v1.8h, v5.16b                  \n"  // U 16 bytes -> 8 shorts.
-    "uqrshrn    v0.8b, v0.8h, #2               \n"  // 2x2 average
-    "uqrshrn    v1.8b, v1.8h, #2               \n"
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
-    "st2        {v0.8b,v1.8b}, [%2], #16       \n"  // store 8 pixels VU.
-    "b.gt       1b                             \n"
-  : "+r"(src_ayuv),  // %0
-    "+r"(src_ayuv_1),  // %1
-    "+r"(dst_vu),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+      "1:                                          \n"
+      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
+                                                                // pixels.
+      "uaddlp     v0.8h, v0.16b                  \n"  // V 16 bytes -> 8 shorts.
+      "uaddlp     v1.8h, v1.16b                  \n"  // U 16 bytes -> 8 shorts.
+      "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+      "uadalp     v0.8h, v4.16b                  \n"  // V 16 bytes -> 8 shorts.
+      "uadalp     v1.8h, v5.16b                  \n"  // U 16 bytes -> 8 shorts.
+      "uqrshrn    v0.8b, v0.8h, #2               \n"  // 2x2 average
+      "uqrshrn    v1.8b, v1.8h, #2               \n"
+      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+      "st2        {v0.8b,v1.8b}, [%2], #16       \n"  // store 8 pixels VU.
+      "b.gt       1b                             \n"
+      : "+r"(src_ayuv),    // %0
+        "+r"(src_ayuv_1),  // %1
+        "+r"(dst_vu),      // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 // Copy row of AYUV Y's into Y
 void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels
-    "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop
-    "st1        {v2.16b}, [%1], #16            \n"  // store 16 Y pixels
-    "b.gt       1b                             \n"
-    : "+r"(src_ayuv),   // %0
-      "+r"(dst_y),      // %1
-      "+r"(width)       // %2
-    :
-    : "cc", "memory", "v0", "v1", "v2", "v3");
+  asm volatile(
+      "1:                                          \n"
+      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
+                                                                // pixels; 3rd byte of each (Y) lands in v2
+      "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop
+      "st1        {v2.16b}, [%1], #16            \n"  // store the 16 Y bytes held in v2
+      "b.gt       1b                             \n"
+      : "+r"(src_ayuv),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
 void FloatDivToByteRow_NEON(const float* src_weights,
@@ -2962,7 +2991,7 @@ void FloatDivToByteRow_NEON(const float* src_weights,
       "uqxtn      v1.4h, v1.4s                   \n"  // 8 shorts
       "uqxtn2     v1.8h, v2.4s                   \n"
       "uqxtn      v1.8b, v1.8h                   \n"  // 8 bytes
- 
+
       "st1        {v1.8b}, [%2], #8              \n"  // store 8 byte out
 
       "fcmgt      v5.4s, v1.4s, v0.4s            \n"  // cmp weight to zero
@@ -2974,15 +3003,31 @@ void FloatDivToByteRow_NEON(const float* src_weights,
       "st1        {v5.8b}, [%3], #8              \n"  // store 8 byte mask
 
       "b.gt       1b                             \n"
-      : "+r"(src_weights), // %0
-        "+r"(src_values),  // %1
-        "+r"(dst_out),     // %2
-        "+r"(dst_mask),    // %3
-        "+r"(width)        // %4
+      : "+r"(src_weights),  // %0
+        "+r"(src_values),   // %1
+        "+r"(dst_out),      // %2
+        "+r"(dst_mask),     // %3
+        "+r"(width)         // %4
       :
       : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
 }
 
+// Swap interleaved UV byte pairs to VU order (NV12 chroma plane to NV21 chroma plane), 16 pairs per loop.
+void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+  asm volatile(
+      "1:                                          \n"
+      "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 UV pairs: v0 = U, v1 = V
+      "orr        v2.16b, v0.16b, v0.16b         \n"  // copy U to v2 so {v1, v2} holds V then U
+      "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop
+      "st2        {v1.16b, v2.16b}, [%1], #32    \n"  // store 16 interleaved VU pairs
+      "b.gt       1b                             \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_vu),  // %1
+        "+r"(width)    // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
author	Frank Barchard <fbarchard@google.com>	2019-04-12 10:20:44 -0700
committer	Commit Bot <commit-bot@chromium.org>	2019-04-12 17:48:45 +0000
commit	413a8d8041f1cc5a350a47c0d81cc721e64f9fd0 (patch)
tree	9e7f16a0109fd129a1eb58b4967c5a6e8ff3a8a5 /source/row_neon64.cc
parent	4bd08cbc0e45fb434a1d1245004c7b922ed91beb (diff)
download	libyuv-413a8d8041f1cc5a350a47c0d81cc721e64f9fd0.tar.gz