path: root/source/row_neon64.cc
author     Frank Barchard <fbarchard@google.com>  2021-12-09 13:44:17 -0800
committer  libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>  2021-12-10 01:00:07 +0000
commit     d7a2d5da87ebdb4e606858dfdb94141bd5aae8c5 (patch)
tree       33c2065ff4b3e479f1165a143d26e3fe28433cbf /source/row_neon64.cc
parent     c5d48a11f9875f6d6aa30375fd66465971ef2327 (diff)
download   libyuv-d7a2d5da87ebdb4e606858dfdb94141bd5aae8c5.tar.gz
J400ToARGB optimized for Exynos using ZIP+ST1
Bug: 204562143
Change-Id: I56c98198c02bd0dd1283f1c14837730c92832c39
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3328702
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r--  source/row_neon64.cc | 274
1 file changed, 211 insertions(+), 63 deletions(-)
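
The whole patch follows one pattern: on cores where the interleaving STn stores are slow (Exynos M1/M2/M3, per the comment in the first hunk), do the interleave in registers with ZIP1/ZIP2 and store with plain ST1/STP instead. A minimal sketch of the two equivalent forms, written with NEON intrinsics rather than the file's inline asm (helper names are illustrative, not part of libyuv):

// Illustrative sketch, not from the patch: the same interleaved store
// written two ways. Which is faster depends on the core.
#include <arm_neon.h>
#include <stdint.h>

// Interleave as part of the store: compiles to a single ST2.
static void store_uv_st2(uint8_t* dst_uv, uint8x16_t u, uint8x16_t v) {
  uint8x16x2_t uv = {{u, v}};
  vst2q_u8(dst_uv, uv);
}

// Interleave in registers with ZIP1/ZIP2, then plain ST1 stores.
static void store_uv_zip(uint8_t* dst_uv, uint8x16_t u, uint8x16_t v) {
  vst1q_u8(dst_uv, vzip1q_u8(u, v));       // u0 v0 u1 v1 ... u7 v7
  vst1q_u8(dst_uv + 16, vzip2q_u8(u, v));  // u8 v8 ... u15 v15
}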
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index b781bda3..69e9fac8 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -15,7 +15,8 @@ namespace libyuv {
extern "C" {
#endif
-// Enable LIBYUV_USE_ST2 and LIBYUV_USE_ST3 for CPUs that prefer them.
+// Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3 and LIBYUV_USE_ST4 for CPUs that
+// prefer STn stores over the ZIP1/ZIP2 + ST1 sequences used by default.
// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions.
// This module is for GCC Neon armv8 64 bit.
@@ -385,6 +386,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_REGS, "v19");
}
+#if LIBYUV_USE_ST4
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
"movi v23.8b, #255 \n"
@@ -402,6 +404,27 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
:
: "cc", "memory", "v20", "v21", "v22", "v23");
}
+#else
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v20.8b, #255 \n"
+ "1: \n"
+ "ldr d16, [%0], #8 \n"
+ "subs %w2, %w2, #8 \n"
+ "zip1 v18.16b, v16.16b, v16.16b \n" // YY
+ "zip1 v19.16b, v16.16b, v20.16b \n" // YA
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v16.16b, v18.16b, v19.16b \n" // YYYA
+ "zip2 v17.16b, v18.16b, v19.16b \n"
+ "stp q16, q17, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20");
+}
+#endif // LIBYUV_USE_ST4
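
The new non-ST4 path builds the Y,Y,Y,A byte stream with a two-level zip tree: ZIP Y with itself (YY pairs) and Y with alpha (YA pairs), then ZIP those two results so each pixel reads Y,Y,Y,255 in memory. The same tree restated with intrinsics, as an illustrative sketch (the helper name is hypothetical):

// Converts 8 gray values to 8 ARGB pixels laid out B,G,R,A = Y,Y,Y,255.
#include <arm_neon.h>
#include <stdint.h>

static void j400_to_argb_8px(const uint8_t* src_y, uint8_t* dst_argb) {
  uint8x16_t y = vcombine_u8(vld1_u8(src_y), vdup_n_u8(0));
  uint8x16_t a = vdupq_n_u8(255);
  uint8x16_t yy = vzip1q_u8(y, y);  // y0 y0 y1 y1 ... y7 y7
  uint8x16_t ya = vzip1q_u8(y, a);  // y0 255 y1 255 ...
  vst1q_u8(dst_argb, vzip1q_u8(yy, ya));       // pixels 0-3: y y y 255
  vst1q_u8(dst_argb + 16, vzip2q_u8(yy, ya));  // pixels 4-7
}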
void NV12ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_uv,
@@ -581,6 +604,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
);
}
+#if LIBYUV_USE_ST2
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
@@ -604,6 +628,86 @@ void MergeUVRow_NEON(const uint8_t* src_u,
);
}
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ asm volatile(
+ "dup v2.8h, %w4 \n"
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // load 8 U
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "ld1 {v1.8h}, [%1], #16 \n" // load 8 V
+ "ushl v0.8h, v0.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ushl v1.8h, v1.8h, v2.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "v0", "v1", "v2");
+}
+#else
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "zip1 v2.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip2 v3.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ asm volatile(
+ "dup v4.8h, %w4 \n"
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // load 8 U
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "ld1 {v1.8h}, [%1], #16 \n" // load 8 V
+ "ushl v0.8h, v0.8h, v4.8h \n"
+ "ushl v1.8h, v1.8h, v4.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v2.8h, v0.8h, v1.8h \n"
+ "zip2 v3.8h, v0.8h, v1.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v1", "v2", "v3", "v4");
+}
+#endif // LIBYUV_USE_ST2
+
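Both branches implement the same contract, which a scalar reference makes explicit; in particular, MergeUVRow_16 left-shifts each sample by 16 - depth so depth-bit values become MSB-aligned 16-bit values before interleaving. A sketch in the style of libyuv's row_common.cc (written here for illustration; the canonical C versions live in that file):

#include <stdint.h>

// Interleave one row of U and V samples.
static void MergeUVRow_Ref(const uint8_t* src_u, const uint8_t* src_v,
                           uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}

// 16-bit variant: scale depth-bit samples (e.g. 10) up to 16 bits,
// matching the USHL by (16 - depth) in the asm, then interleave.
static void MergeUVRow_16_Ref(const uint16_t* src_u, const uint16_t* src_v,
                              uint16_t* dst_uv, int depth, int width) {
  int shift = 16 - depth;
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = (uint16_t)(src_u[x] << shift);
    dst_uv[2 * x + 1] = (uint16_t)(src_v[x] << shift);
  }
}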
// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
void SplitRGBRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_r,
@@ -684,6 +788,7 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba,
);
}
+#if LIBYUV_USE_ST4
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
@@ -693,9 +798,9 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
int width) {
asm volatile(
"1: \n"
- "ld1 {v2.16b}, [%0], #16 \n" // load R
- "ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v0.16b}, [%2], #16 \n" // load B
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%0], #16 \n" // load R
"ld1 {v3.16b}, [%3], #16 \n" // load A
"subs %w5, %w5, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
@@ -714,6 +819,47 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#else
+// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
+void MergeARGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%2], #16 \n" // load B
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%0], #16 \n" // load R
+ "ld1 {v3.16b}, [%3], #16 \n" // load A
+ "subs %w5, %w5, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%2, 448] \n"
+ "zip1 v4.16b, v0.16b, v1.16b \n" // BG
+ "zip1 v5.16b, v2.16b, v3.16b \n" // RA
+ "prfm pldl1keep, [%1, 448] \n"
+ "zip2 v6.16b, v0.16b, v1.16b \n" // BG
+ "zip2 v7.16b, v2.16b, v3.16b \n" // RA
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v0.8h, v4.8h, v5.8h \n" // BGRA
+ "zip2 v1.8h, v4.8h, v5.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "zip1 v2.8h, v6.8h, v7.8h \n"
+ "zip2 v3.8h, v6.8h, v7.8h \n"
+ "st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
+ );
+}
+#endif // LIBYUV_USE_ST4
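
The non-ST4 MergeARGBRow interleaves four planes in two rounds: a byte-level ZIP pairs B with G and R with A, then a 16-bit ZIP interleaves those pairs so each 32-bit group reads B,G,R,A in memory. The first 8 pixels of that sequence in intrinsics, as an illustrative sketch (pixels 8-15 repeat it with the zip2 byte halves):

#include <arm_neon.h>
#include <stdint.h>

// Hypothetical helper, not part of the patch.
static void merge_bgra_8px(uint8x16_t b, uint8x16_t g, uint8x16_t r,
                           uint8x16_t a, uint8_t* dst_argb) {
  uint16x8_t bg = vreinterpretq_u16_u8(vzip1q_u8(b, g));  // b0 g0 b1 g1 ...
  uint16x8_t ra = vreinterpretq_u16_u8(vzip1q_u8(r, a));  // r0 a0 r1 a1 ...
  vst1q_u8(dst_argb, vreinterpretq_u8_u16(vzip1q_u16(bg, ra)));       // px 0-3
  vst1q_u8(dst_argb + 16, vreinterpretq_u8_u16(vzip2q_u16(bg, ra)));  // px 4-7
}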
// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b.
void SplitXRGBRow_NEON(const uint8_t* src_rgba,
@@ -1706,6 +1852,32 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb,
:
: "cc", "memory", "v0", "v1", "v2", "v3");
}
+
+static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
+ 10, 9, 8, 11, 14, 13, 12, 15};
+
+void ARGBToAB64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
+ asm volatile(
+ "ldr q4, [%3] \n" // shuffler
+ "1: \n"
+ "ldp q0, q2, [%0], #32 \n" // load 8 pixels
+ "tbl v0.16b, {v0.16b}, v4.16b \n"
+ "tbl v2.16b, {v2.16b}, v4.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mov v1.16b, v0.16b \n"
+ "mov v3.16b, v2.16b \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
+ "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleARGBToABGR) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
#else
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ar64,
@@ -1719,7 +1891,7 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb,
"prfm pldl1keep, [%0, 448] \n"
"zip1 v4.16b, v1.16b, v1.16b \n"
"zip2 v5.16b, v1.16b, v1.16b \n"
- "st1 {v2.16b, v3.16b, v4.16b, v5.16b}, [%1], #64 \n" // 8 AR64
+ "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_ar64), // %1
@@ -1727,33 +1899,33 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb,
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
-#endif // LIBYUV_USE_ST2
-static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
- 10, 9, 8, 11, 14, 13, 12, 15};
+static const uvec8 kShuffleARGBToAB64[2] = {
+ {2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7},
+ {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}};
void ARGBToAB64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ab64,
int width) {
asm volatile(
- "ldr q4, [%3] \n" // shuffler
+ "ldp q6, q7, [%3] \n" // 2 shufflers
"1: \n"
- "ldp q0, q2, [%0], #32 \n" // load 8 pixels
- "tbl v0.16b, {v0.16b}, v4.16b \n"
- "tbl v2.16b, {v2.16b}, v4.16b \n"
- "prfm pldl1keep, [%0, 448] \n"
- "mov v1.16b, v0.16b \n"
- "mov v3.16b, v2.16b \n"
+ "ldp q0, q1, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
- "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_ab64), // %1
- "+r"(width) // %2
- : "r"(&kShuffleARGBToABGR) // %3
+ "tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64
+ "tbl v3.16b, {v0.16b}, v7.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v4.16b, {v1.16b}, v6.16b \n"
+ "tbl v5.16b, {v1.16b}, v7.16b \n"
+ "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleARGBToAB64[0]) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
+#endif // LIBYUV_USE_ST2
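
The rewritten kernel widens each 8-bit channel by duplicating its byte into both halves of a 16-bit lane via TBL, so a channel value c becomes c * 0x0101, mapping 0..255 onto the full 0..65535 range; the shuffle also swaps R and B. A scalar equivalent, sketched for illustration (libyuv's canonical C version lives in row_common.cc):

#include <stdint.h>

static void argb_to_ab64_row(const uint8_t* src_argb, uint16_t* dst_ab64,
                             int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t b = src_argb[4 * x + 0];  // ARGB is B,G,R,A in memory
    uint8_t g = src_argb[4 * x + 1];
    uint8_t r = src_argb[4 * x + 2];
    uint8_t a = src_argb[4 * x + 3];
    dst_ab64[4 * x + 0] = (uint16_t)(r * 0x0101);  // R and B swap
    dst_ab64[4 * x + 1] = (uint16_t)(g * 0x0101);
    dst_ab64[4 * x + 2] = (uint16_t)(b * 0x0101);
    dst_ab64[4 * x + 3] = (uint16_t)(a * 0x0101);
  }
}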
static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15,
17, 19, 21, 23, 25, 27, 29, 31};
@@ -3720,10 +3892,10 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
: "cc", "memory", "v0", "v1", "v2");
}
#else
-static const uvec8 kYUV24Shuffle[3] =
- {{ 16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20 },
- { 21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27 },
- { 10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15 }};
+static const uvec8 kYUV24Shuffle[3] = {
+ {16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20},
+ {21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27},
+ {10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15}};
// Convert biplanar NV21 to packed YUV24
// NV21 has VU in memory for chroma.
@@ -3733,27 +3905,29 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
uint8_t* dst_yuv24,
int width) {
asm volatile(
- "ld1 {v5.16b,v6.16b,v7.16b}, [%4]\n" // 3 shuffler constants
+ "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values
- "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values
- "tbl v2.16b, {v0.16b,v1.16b}, v5.16b\n" // weave into YUV24
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values
+ "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values
+ "tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24
"prfm pldl1keep, [%0, 448] \n"
- "tbl v3.16b, {v0.16b,v1.16b}, v6.16b\n"
+ "tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n"
"prfm pldl1keep, [%1, 448] \n"
- "tbl v4.16b, {v0.16b,v1.16b}, v7.16b\n"
- "subs %w3, %w3, #16 \n" // 16 pixels per loop
- "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48\n" // store 16 YUV pixels
+ "tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels
"b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_yuv24), // %2
- "+r"(width) // %3
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
: "r"(&kYUV24Shuffle[0]) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
#endif // LIBYUV_USE_ST3
+// Note: the ST2 8b version is faster than ZIP1/ZIP2 + ST1.
+
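In the TBL-based NV21ToYUV24 path above, each kYUV24Shuffle index below 16 selects a Y byte and each index 16-31 selects from the VU vector, so one TBL produces 16 output bytes, with every VU pair shared by two adjacent Y samples. A scalar restatement, sketched for illustration (the canonical C version is in row_common.cc):

#include <stdint.h>

// Output is 3 bytes per pixel, V,U,Y, the layout kYUV24Shuffle encodes.
static void nv21_to_yuv24_row(const uint8_t* src_y, const uint8_t* src_vu,
                              uint8_t* dst_yuv24, int width) {
  for (int x = 0; x < width; ++x) {
    dst_yuv24[3 * x + 0] = src_vu[(x / 2) * 2 + 0];  // V (NV21 stores VU)
    dst_yuv24[3 * x + 1] = src_vu[(x / 2) * 2 + 1];  // U
    dst_yuv24[3 * x + 2] = src_y[x];                 // Y
  }
}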
// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv,
@@ -3915,32 +4089,6 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
: "cc", "memory", "v0", "v1", "v2");
}
-void MergeUVRow_16_NEON(const uint16_t* src_u,
- const uint16_t* src_v,
- uint16_t* dst_uv,
- int depth,
- int width) {
- int shift = 16 - depth;
- asm volatile(
- "dup v2.8h, %w4 \n"
- "1: \n"
- "ld1 {v0.8h}, [%0], #16 \n" // load 8 U
- "subs %w3, %w3, #8 \n" // 8 src pixels per loop
- "ld1 {v1.8h}, [%1], #16 \n" // load 8 V
- "ushl v0.8h, v0.8h, v2.8h \n"
- "prfm pldl1keep, [%0, 448] \n"
- "ushl v1.8h, v1.8h, v2.8h \n"
- "prfm pldl1keep, [%1, 448] \n"
- "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
- "b.gt 1b \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- : "r"(shift) // %4
- : "cc", "memory", "v0", "v1", "v2");
-}
-
void MultiplyRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,