aboutsummaryrefslogtreecommitdiff
path: root/source/row_neon64.cc
diff options
context:
space:
mode:
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r--  source/row_neon64.cc  120
1 files changed, 120 insertions, 0 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index acefd96d..941c9b98 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3526,6 +3526,126 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
: "cc", "memory", "v0", "v1", "v2", "v3");
}
+// Splits one row of interleaved 16-bit UV pixels into separate U and V rows,
+// shifting every sample by the amount passed in 'depth'.
+// NOTE(review): %w3 ('depth') is used directly as the USHL shift count.
+// USHL shifts left for positive lane values and right for negative ones, so
+// the caller appears to pass a (possibly negative) shift amount rather than
+// a bit depth -- confirm against SplitUVRow_16_C and the call sites.
+void SplitUVRow_16_NEON(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {
+ asm volatile(
+ // Broadcast the shift amount into all four 32-bit lanes of v0.
+ "dup v0.4s, %w3 \n"
+ "1: \n"
+ "ld2 {v1.8h, v2.8h}, [%0], #32 \n" // load 8 UV
+ "prfm pldl1keep, [%0, 448] \n"
+ // Zero-extend the 8 U samples to 32 bits so a left shift cannot
+ // discard high bits before the narrowing XTN below.
+ "ushll v3.4s, v1.4h, #0 \n"
+ "ushll2 v4.4s, v1.8h, #0 \n"
+ "ushl v3.4s, v3.4s, v0.4s \n"
+ "ushl v4.4s, v4.4s, v0.4s \n"
+ // Narrow back to 16 bits, keeping the low halfword of each lane.
+ "xtn v1.4h, v3.4s \n"
+ "xtn2 v1.8h, v4.4s \n"
+ // Same widen/shift/narrow sequence for the 8 V samples.
+ "ushll v3.4s, v2.4h, #0 \n"
+ "ushll2 v4.4s, v2.8h, #0 \n"
+ "ushl v3.4s, v3.4s, v0.4s \n"
+ "ushl v4.4s, v4.4s, v0.4s \n"
+ "xtn v2.4h, v3.4s \n"
+ "xtn2 v2.8h, v4.4s \n"
+ "subs %w4, %w4, #8 \n" // 8 src pixels per loop
+ "st1 {v1.8h}, [%1], #16 \n" // store 8 U pixels
+ "st1 {v2.8h}, [%2], #16 \n" // store 8 V pixels
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(depth), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Interleaves separate 16-bit U and V rows into one UV row, left-shifting
+// each sample by (16 - depth) so depth-bit samples become MSB-aligned
+// 16-bit values (e.g. shift of 6 for 10-bit content).
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ asm volatile(
+ // Broadcast the shift into every 16-bit lane of v2.
+ "dup v2.8h, %w3 \n"
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // load 8 U
+ "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v1.8h}, [%1], #16 \n" // load 8 V
+ "prfm pldl1keep, [%1, 448] \n"
+ // Shift samples up so the depth-bit values occupy the high bits.
+ "ushl v0.8h, v0.8h, v2.8h \n"
+ "ushl v1.8h, v1.8h, v2.8h \n"
+ "subs %w4, %w4, #8 \n" // 8 src pixels per loop
+ // ST2 re-interleaves the two registers as U,V,U,V,...
+ "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(shift), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+// Multiplies each 16-bit pixel by 'scale', keeping the low 16 bits of the
+// product (MUL .8h truncates). NOTE(review): callers presumably pass a
+// scale such that the product fits in 16 bits (e.g. a power of two to
+// MSB-align low-bit-depth content) -- confirm against MultiplyRow_16_C.
+void MultiplyRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ asm volatile(
+ // Broadcast the scale into every 16-bit lane of v2.
+ "dup v2.8h, %w2 \n"
+ "1: \n"
+ "ldp q0, q1, [%0] \n"
+ "add %0, %0, #32 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mul v0.8h, v0.8h, v2.8h \n"
+ "mul v1.8h, v1.8h, v2.8h \n"
+ "stp q0, q1, [%1] \n" // store 16 pixels
+ "add %1, %1, #32 \n"
+ "subs %w3, %w3, #16 \n" // 16 src pixels per loop
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(scale), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+// Scales each 16-bit pixel down: dst[x] = (src[x] * scale) >> 16,
+// processing 16 pixels per loop iteration.
+// Fixes vs the previous revision:
+//  1. 'dup v0.8h' replicated scale into 16-bit lanes, but the MUL .4s
+//     instructions read v0 as 32-bit lanes, so each lane held
+//     scale | (scale << 16) and every product was wrong. Use 'dup v0.4s'.
+//  2. 'stp q3, q3' stored pixels 0-7 twice and never wrote q4, so pixels
+//     8-15 of every group were garbage. Store 'q3, q4'.
+void DivideRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ asm volatile(
+ // Broadcast the full 32-bit scale into each 32-bit lane of v0.
+ "dup v0.4s, %w2 \n"
+ "1: \n"
+ "ldp q1, q2, [%0] \n" // load 16 pixels
+ "add %0, %0, #32 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ // Zero-extend the 16-bit pixels to 32 bits before multiplying.
+ "ushll v3.4s, v1.4h, #0 \n" // pixels 0-3
+ "ushll v4.4s, v2.4h, #0 \n" // pixels 8-11
+ "ushll2 v1.4s, v1.8h, #0 \n" // pixels 4-7
+ "ushll2 v2.4s, v2.8h, #0 \n" // pixels 12-15
+ "mul v3.4s, v0.4s, v3.4s \n"
+ "mul v4.4s, v0.4s, v4.4s \n"
+ "mul v1.4s, v0.4s, v1.4s \n"
+ "mul v2.4s, v0.4s, v2.4s \n"
+ // Keep the high half of each product: (pixel * scale) >> 16.
+ "shrn v3.4h, v3.4s, #16 \n"
+ "shrn v4.4h, v4.4s, #16 \n"
+ "shrn2 v3.8h, v1.4s, #16 \n" // v3 = pixels 0-7
+ "shrn2 v4.8h, v2.4s, #16 \n" // v4 = pixels 8-15
+ "stp q3, q4, [%1] \n" // store 16 pixels
+ "add %1, %1, #32 \n"
+ "subs %w3, %w3, #16 \n" // 16 src pixels per loop
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(scale), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus