diff options
author | Frank Barchard <fbarchard@google.com> | 2022-06-28 16:31:22 -0700 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2022-06-29 00:00:46 +0000 |
commit | 6900494d90ae095d44405cd4cc3f346971fa69c9 (patch) | |
tree | 4a77f6084ee5c3f6b6a2d1d2de7587111175f8fe /source/row_neon64.cc | |
parent | fe4a50df8e2a787e2919a8321dbe1412b94b20c6 (diff) | |
download | libyuv-6900494d90ae095d44405cd4cc3f346971fa69c9.tar.gz |
Merge/SplitRGB fix -mcmodel=large x86 and InterpolateRow_16To8_NEON
MergeRGB and SplitRGB use a register to point to 9 shuffle tables.
- fixes an out of registers error with -mcmodel=large
InterpolateRow_16To8_NEON improves performance for I210ToI420:
On Pixel 4 for 720p x1000 images
Was I210ToI420_Opt (608 ms)
Now I210ToI420_Opt (336 ms)
On Skylake Xeon
Was I210ToI420_Opt (259 ms)
Now I210ToI420_Opt (209 ms)
Bug: libyuv:931, libyuv:930
Change-Id: I20f8244803f06da511299bf1a2ffc7945eb35221
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3717054
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r-- | source/row_neon64.cc | 80 |
1 files changed, 80 insertions, 0 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 00adfe3e..a5313b7e 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -3031,6 +3031,86 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); } +// Bilinear filter 8x2 -> 8x1 +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr + + asm volatile( + "dup v6.8h, %w6 \n" + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + + "dup v5.8h, %w4 \n" + "dup v4.8h, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "ld1 {v1.8h}, [%2], #16 \n" + "subs %w3, %w3, #8 \n" + "umull v2.4s, v0.4h, v4.4h \n" + "prfm pldl1keep, [%1, 448] \n" + "umull2 v3.4s, v0.8h, v4.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "umlal v2.4s, v1.4h, v5.4h \n" + "umlal2 v3.4s, v1.8h, v5.8h \n" + "rshrn v0.4h, v2.4s, #8 \n" + "rshrn2 v0.8h, v3.4s, #8 \n" + "ushl v0.8h, v0.8h, v6.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%0], #8 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.8h}, [%1], #16 \n" + "ld1 {v1.8h}, [%2], #16 \n" + "subs %w3, %w3, #8 \n" + "prfm pldl1keep, [%1, 448] \n" + "urhadd v0.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v0.8h, v0.8h, v6.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%0], #8 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ldr q0, [%1], #16 \n" + "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative + "prfm pldl1keep, [%1, 448] \n" + "uqxtn v0.8b, v0.8h \n" + "subs %w3, %w3, #8 \n" // 8 src pixels per loop + "str d0, [%0], #8 \n" // store 8 pixels + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width) // %3 + : "r"(y1_fraction), // %4 + "r"(y0_fraction), // %5 + "r"(shift) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); +} + // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, |