aboutsummaryrefslogtreecommitdiff
path: root/source/row_neon64.cc
diff options
context:
space:
mode:
authorFrank Barchard <fbarchard@google.com>2023-02-13 10:52:58 -0800
committerlibyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>2023-02-13 20:14:57 +0000
commit2bdc210be9eb11ded16bf3ef1f6cadb0d4dcb0c2 (patch)
treec446d71538c965d0e5391ef77cd49b45ba51463d /source/row_neon64.cc
parentb2528b0be934de1918e20c85fc170d809eeb49ab (diff)
downloadlibyuv-2bdc210be9eb11ded16bf3ef1f6cadb0d4dcb0c2.tar.gz
MergeUV_AVX512BW for I420ToNV12
On Skylake Xeon 640x360 100000 iterations AVX512 MergeUVPlane_Opt (1196 ms) AVX2 MergeUVPlane_Opt (1565 ms) SSE2 MergeUVPlane_Opt (1780 ms) Pixel 7 MergeUVPlane_Opt (1177 ms) Bug: None Change-Id: If47d4fa957cf27781bba5fd6a2f0bf554101a5c6 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4242247 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: richard winterton <rrwinterton@gmail.com>
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r--source/row_neon64.cc22
1 files changed, 22 insertions, 0 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 7f04b606..df346ee0 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -820,6 +820,28 @@ void MergeUVRow_NEON(const uint8_t* src_u,
: "cc", "memory", "v0", "v1" // Clobber List
);
}
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON1(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b,v2.16b}, [%0], #32 \n" // load U
+ "ld1 {v1.16b,v3.16b}, [%1], #32 \n" // load V
+ "subs %w3, %w3, #32 \n" // 32 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st2 {v0.16b,v1.16b,v2.16b,v3.16b}, [%2], #64 \n" // store 32 UV
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
void MergeUVRow_16_NEON(const uint16_t* src_u,
const uint16_t* src_v,