about | summary | refs | log | tree | commit | diff
path: root/source/row_neon64.cc
diff options
context:
space:
mode:
author: Frank Barchard <fbarchard@google.com> 2023-01-17 13:06:38 -0800
committer: libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> 2023-01-17 21:40:45 +0000
commit: 541d8efbaf0319b1e068d9c31f6dbc653d5c72b8 (patch)
tree: 51cb0f426435b7b3d9f5553144300772806053e8 /source/row_neon64.cc
parent: d5aa3d4a76930cde9527f5be5bc89524b48fc069 (diff)
download: libyuv-541d8efbaf0319b1e068d9c31f6dbc653d5c72b8.tar.gz
Fix for divide row functions used by P010ToI010
Bug: libyuv:951
Change-Id: Id323656cb6f99b1be0be7aaa854d3cc15feeba69
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4166562
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r-- source/row_neon64.cc 32
1 files changed, 16 insertions, 16 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 41289fe9..088174c8 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -4461,30 +4461,30 @@ void DivideRow_16_NEON(const uint16_t* src_y,
int scale,
int width) {
asm volatile(
- "dup v0.8h, %w3 \n"
+ "dup v6.8h, %w3 \n"
"1: \n"
- "ldp q1, q2, [%0], #32 \n"
- "ushll v3.4s, v1.4h, #0 \n"
- "ushll v4.4s, v2.4h, #0 \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "ushll v2.4s, v0.4h, #0 \n"
+ "ushll2 v3.4s, v0.8h, #0 \n"
+ "ushll v4.4s, v1.4h, #0 \n"
+ "ushll2 v5.4s, v1.8h, #0 \n"
"prfm pldl1keep, [%0, 448] \n"
- "ushll2 v1.4s, v1.8h, #0 \n"
- "ushll2 v2.4s, v2.8h, #0 \n"
- "mul v3.4s, v0.4s, v3.4s \n"
- "mul v4.4s, v0.4s, v4.4s \n"
- "mul v1.4s, v0.4s, v1.4s \n"
- "mul v2.4s, v0.4s, v2.4s \n"
- "shrn v3.4h, v3.4s, #16 \n"
- "shrn v4.4h, v4.4s, #16 \n"
- "shrn2 v3.8h, v1.4s, #16 \n"
- "shrn2 v4.8h, v2.4s, #16 \n"
- "stp q3, q3, [%1], #32 \n" // store 16 pixels
+ "mul v2.4s, v2.4s, v6.4s \n"
+ "mul v3.4s, v3.4s, v6.4s \n"
+ "mul v4.4s, v4.4s, v6.4s \n"
+ "mul v5.4s, v5.4s, v6.4s \n"
+ "shrn v0.4h, v2.4s, #16 \n"
+ "shrn2 v0.8h, v3.4s, #16 \n"
+ "shrn v1.4h, v4.4s, #16 \n"
+ "shrn2 v1.8h, v5.4s, #16 \n"
+ "stp q0, q1, [%1], #32 \n" // store 16 pixels
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
// Use scale to convert lsb formats to msb, depending how many bits there are: