1 files changed, 130 insertions, 49 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index a341dc13..a5c24e84 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2198,19 +2198,26 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
   );
 }
 
+struct RgbUVConstants {
+  uint8_t kRGBToU[4];  // U weights in B, G, R order, then a 0 pad byte
+  uint8_t kRGBToV[4];  // V weights in B, G, R order, then a 0 pad byte
+};
+
 // 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8_t* src_argb,
-                         uint8_t* dst_u,
-                         uint8_t* dst_v,
-                         int width) {
-  asm volatile(
-      "movi        v24.8b, #112                  \n"  // UB / VR 0.875
-                                                      // coefficient
-      "movi        v25.8b, #74                   \n"  // UG -0.5781 coefficient
-      "movi        v26.8b, #38                   \n"  // UR -0.2969 coefficient
-      "movi        v27.8b, #18                   \n"  // VB -0.1406 coefficient
-      "movi        v28.8b, #94                   \n"  // VG -0.7344 coefficient
-      "movi        v29.16b,#0x80                 \n"  // 128.5
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width,
+                               const struct RgbUVConstants* rgbuvconstants) {
+  asm volatile(
+      "ldr         d0, [%4]                      \n"  // load 8 bytes: U then V
+      "dup         v24.16b, v0.b[0]              \n"  // UB = kRGBToU[0]
+      "dup         v25.16b, v0.b[1]              \n"  // UG = kRGBToU[1]
+      "dup         v26.16b, v0.b[2]              \n"  // UR = kRGBToU[2]
+      "dup         v27.16b, v0.b[4]              \n"  // VB = kRGBToV[0]
+      "dup         v28.16b, v0.b[5]              \n"  // VG = kRGBToV[1]
+      "movi        v29.16b, #0x80                \n"  // 128.5
+
       "1:                                        \n"
       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
@@ -2229,15 +2236,53 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
       "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels U.
       "st1         {v1.8b}, [%2], #8             \n"  // store 8 pixels V.
       "b.gt        1b                            \n"
-      : "+r"(src_argb),  // %0
-        "+r"(dst_u),     // %1
-        "+r"(dst_v),     // %2
-        "+r"(width)      // %3
-      :
+      : "+r"(src_argb),      // %0
+        "+r"(dst_u),         // %1
+        "+r"(dst_v),         // %2
+        "+r"(width)          // %3
+      : "r"(rgbuvconstants)  // %4
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
         "v27", "v28", "v29");
 }
 
+// RGB to BT.601 UV coefficients, stored as magnitudes in B, G, R order.
+// UB   0.875 coefficient = 112
+// UG -0.5781 coefficient = 74
+// UR -0.2969 coefficient = 38
+// VB -0.1406 coefficient = 18
+// VG -0.7344 coefficient = 94
+// VR   0.875 coefficient = 112 (ignored)
+
+static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
+                                                            {18, 94, 112, 0}};
+
+// RGB to JPeg UV coefficients, stored as magnitudes in B, G, R order.
+// UB coeff 0.500    = 127
+// UG coeff -0.33126 = 84
+// UR coeff -0.16874 = 43
+// VB coeff -0.08131 = 20
+// VG coeff -0.41869 = 107
+// VR coeff 0.500    = 127 (ignored)
+
+static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
+                                                            {20, 107, 127, 0}};
+
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+                            &kRgb24I601UVConstants);  // BT.601 U/V weights
+}
+
+void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+                            &kRgb24JPegUVConstants);  // JPeg U/V weights
+}
+
 #define RGBTOUV_SETUP_REG                                                  \
   "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
   "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
@@ -2943,34 +2988,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
 struct RgbConstants {
   uint8_t kRGBToY[4];
   uint16_t kAddY;
-  uint16_t pad;
 };
 
-// RGB to JPeg coefficients
-// B * 0.1140 coefficient = 29
-// G * 0.5870 coefficient = 150
-// R * 0.2990 coefficient = 77
-// Add 0.5 = 0x80
-static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
-                                                        128,
-                                                        0};
-
-static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
-
-// RGB to BT.601 coefficients
-// B * 0.1016 coefficient = 25
-// G * 0.5078 coefficient = 129
-// R * 0.2578 coefficient = 66
-// Add 16.5 = 0x1080
-
-static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                        0x1080,
-                                                        0};
-
-static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
-                                                      0x1080,
-                                                      0};
-
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
                            uint8_t* dst_y,
@@ -3005,6 +3024,26 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
         "v17");
 }
 
+// RGB to JPeg coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
+// Same weights as above with the first (B) and third (R) entries swapped.
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+                                                        0x1080};
+// Same weights as above with the first (B) and third (R) entries swapped.
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
+
 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
   ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
 }
@@ -3402,24 +3441,26 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int width) {
   asm volatile(
+      "movi        v7.8h, #0x00ff                \n"  // 255 bias: >> 8 rounds up
+
       // Attenuate 8 pixels.
       "1:                                        \n"
       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
       "umull       v4.8h, v0.8b, v3.8b           \n"  // b * a
       "prfm        pldl1keep, [%0, 448]          \n"
-      "umull       v5.8h, v1.8b, v3.8b           \n"         // g * a
-      "umull       v6.8h, v2.8b, v3.8b           \n"         // r * a
-      "uqrshrn     v0.8b, v4.8h, #8              \n"         // b >>= 8
-      "uqrshrn     v1.8b, v5.8h, #8              \n"         // g >>= 8
-      "uqrshrn     v2.8b, v6.8h, #8              \n"         // r >>= 8
+      "umull       v5.8h, v1.8b, v3.8b           \n"  // g * a
+      "umull       v6.8h, v2.8b, v3.8b           \n"  // r * a
+      "addhn       v0.8b, v4.8h, v7.8h           \n"         // (b * a + 255) >> 8
+      "addhn       v1.8b, v5.8h, v7.8h           \n"         // (g * a + 255) >> 8
+      "addhn       v2.8b, v6.8h, v7.8h           \n"         // (r * a + 255) >> 8
       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
       "b.gt        1b                            \n"
       : "+r"(src_argb),  // %0
         "+r"(dst_argb),  // %1
         "+r"(width)      // %2
       :
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 // Quantize 8 ARGB pixels (32 bytes).
@@ -3980,6 +4021,46 @@ void ConvertFP16ToFP32Row_NEON(const uint16_t* src,  // fp16
       : "cc", "memory", "v1", "v2", "v3");
 }
 
+// Convert a strided column of FP16 half floats to a contiguous row of FP32.
+// Any width works: groups of 8, then singles; width <= 0 stores nothing.
+void ConvertFP16ToFP32Column_NEON(const uint16_t* src,  // fp16
+                                  int src_stride,       // stride in elements
+                                  float* dst,
+                                  int width) {
+  asm volatile(
+      "cmp         %w2, #8                       \n"  // At least 8 rows?
+      "b.lt        2f                            \n"
+      "1:                                        \n"
+      "ld1         {v0.h}[0], [%0], %3           \n"  // load 8 halffloats
+      "ld1         {v0.h}[1], [%0], %3           \n"
+      "ld1         {v0.h}[2], [%0], %3           \n"
+      "ld1         {v0.h}[3], [%0], %3           \n"
+      "ld1         {v1.h}[0], [%0], %3           \n"
+      "ld1         {v1.h}[1], [%0], %3           \n"
+      "ld1         {v1.h}[2], [%0], %3           \n"
+      "ld1         {v1.h}[3], [%0], %3           \n"
+      "sub         %w2, %w2, #8                  \n"  // 8 rows per loop
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "fcvtl       v2.4s, v0.4h                  \n"  // 4 floats
+      "fcvtl       v3.4s, v1.4h                  \n"  // 4 more floats
+      "stp         q2, q3, [%1], #32             \n"  // store 8 floats
+      "cmp         %w2, #8                       \n"  // Another full 8 rows?
+      "b.ge        1b                            \n"
+      "2:                                        \n"
+      "cmp         %w2, #1                       \n"  // Any rows left?
+      "b.lt        4f                            \n"
+      "3:                                        \n"
+      "ld1         {v1.h}[0], [%0], %3           \n"  // load 1 halffloat
+      "subs        %w2, %w2, #1                  \n"  // 1 float per loop
+      "fcvtl       v2.4s, v1.4h                  \n"  // only lane 0 is stored
+      "str         s2, [%1], #4                  \n"  // store 1 float
+      "b.gt        3b                            \n"
+      "4:                                        \n"
+      : "+r"(src), "+r"(dst), "+r"(width)  // %0, %1, %2
+      : "r"((ptrdiff_t)src_stride * 2)     // %3 stride in bytes, no int overflow
+      : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
 // Convert FP32 Floats to FP16 Half Floats
 void ConvertFP32ToFP16Row_NEON(const float* src,
                                uint16_t* dst,  // fp16