Merge/SplitRGB fix -mcmodel=large x86 and InterpolateRow_16To8_NEON

MergeRGB and SplitRGB use a register to point to 9 shuffle tables. - fixes an out of registers error with -mcmodel=large InterpolateRow_16To8_NEON improves performance for I210ToI420: On Pixel 4 for 720p x1000 images Was I210ToI420_Opt (608 ms) Now I210ToI420_Opt (336 ms) On Skylake Xeon Was I210ToI420_Opt (259 ms) Now I210ToI420_Opt (209 ms) Bug: libyuv:931, libyuv:930 Change-Id: I20f8244803f06da511299bf1a2ffc7945eb35221 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3717054 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Justin Green <greenjustin@google.com>
author: Frank Barchard <fbarchard@google.com> 2022-06-28 16:31:22 -0700
committer: libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> 2022-06-29 00:00:46 +0000
commit: 6900494d90ae095d44405cd4cc3f346971fa69c9 (patch)
tree: 4a77f6084ee5c3f6b6a2d1d2de7587111175f8fe /source/row_neon64.cc
parent: fe4a50df8e2a787e2919a8321dbe1412b94b20c6 (diff)
download: libyuv-6900494d90ae095d44405cd4cc3f346971fa69c9.tar.gz
1 files changed, 80 insertions, 0 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 00adfe3e..a5313b7e 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3031,6 +3031,86 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr,
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
 }
 
+// Bilinear filter 8x2 -> 8x1: blends two 16 bit rows into one 8 bit row.
+// source_y_fraction is the weight, out of 256, of the row at src_stride.
+// scale picks the shift, 15 - clz(scale), that narrows the source depth:
+// 32768 = 9 bits, 16384 = 10 bits, 4096 = 12 bits, 256 = 16 bits.
+// scale must be non-zero (clz(0) is undefined). src_stride is in uint16_t
+// units. 8 pixels are written per loop, so dst_width is rounded up to 8.
+void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
+                               const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               int scale,
+                               int dst_width,
+                               int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint16_t* src_ptr1 = src_ptr + src_stride;
+  int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
+
+  asm volatile(
+      "dup         v6.8h, %w6                    \n"  // shift for all paths
+      "cmp         %w4, #0                       \n"
+      "b.eq        100f                          \n"  // only first row used
+      "cmp         %w4, #128                     \n"
+      "b.eq        50f                           \n"  // equal weights
+
+      "dup         v5.8h, %w4                    \n"  // y1 weight
+      "dup         v4.8h, %w5                    \n"  // y0 weight
+      // General purpose row blend.
+      "1:                                        \n"
+      "ld1         {v0.8h}, [%1], #16            \n"
+      "ld1         {v1.8h}, [%2], #16            \n"
+      "subs        %w3, %w3, #8                  \n"
+      "umull       v2.4s, v0.4h, v4.4h           \n"  // row0 * y0
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "umull2      v3.4s, v0.8h, v4.8h           \n"
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "umlal       v2.4s, v1.4h, v5.4h           \n"  // += row1 * y1
+      "umlal2      v3.4s, v1.8h, v5.8h           \n"
+      "rshrn       v0.4h, v2.4s, #8              \n"  // rounded / 256
+      "rshrn2      v0.8h, v3.4s, #8              \n"
+      "ushl        v0.8h, v0.8h, v6.8h           \n"  // shr = v6 is negative
+      "uqxtn       v0.8b, v0.8h                  \n"  // saturate to 8 bit
+      "st1         {v0.8b}, [%0], #8             \n"
+      "b.gt        1b                            \n"
+      "b           99f                           \n"
+
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "ld1         {v0.8h}, [%1], #16            \n"
+      "ld1         {v1.8h}, [%2], #16            \n"
+      "subs        %w3, %w3, #8                  \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "urhadd      v0.8h, v0.8h, v1.8h           \n"  // rounded average
+      "prfm        pldl1keep, [%2, 448]          \n"
+      "ushl        v0.8h, v0.8h, v6.8h           \n"  // shr = v6 is negative
+      "uqxtn       v0.8b, v0.8h                  \n"
+      "st1         {v0.8b}, [%0], #8             \n"
+      "b.gt        50b                           \n"
+      "b           99f                           \n"
+
+      // Blend 100 / 0 - Copy first row, narrowed to 8 bits.
+      "100:                                      \n"
+      "ldr         q0, [%1], #16                 \n"
+      "ushl        v0.8h, v0.8h, v6.8h           \n"  // shr = v6 is negative
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uqxtn       v0.8b, v0.8h                  \n"
+      "subs        %w3, %w3, #8                  \n"  // 8 src pixels per loop
+      "str         d0, [%0], #8                  \n"  // store 8 pixels
+      "b.gt        100b                          \n"
+
+      "99:                                       \n"
+      : "+r"(dst_ptr),     // %0
+        "+r"(src_ptr),     // %1
+        "+r"(src_ptr1),    // %2
+        "+r"(dst_width)    // %3
+      : "r"(y1_fraction),  // %4
+        "r"(y0_fraction),  // %5
+        "r"(shift)         // %6
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
 void ARGBBlendRow_NEON(const uint8_t* src_argb,
                        const uint8_t* src_argb1,
author	Frank Barchard <fbarchard@google.com>	2022-06-28 16:31:22 -0700
committer	libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>	2022-06-29 00:00:46 +0000
commit	6900494d90ae095d44405cd4cc3f346971fa69c9 (patch)
tree	4a77f6084ee5c3f6b6a2d1d2de7587111175f8fe /source/row_neon64.cc
parent	fe4a50df8e2a787e2919a8321dbe1412b94b20c6 (diff)
download	libyuv-6900494d90ae095d44405cd4cc3f346971fa69c9.tar.gz