diff options
author | Frank Barchard <fbarchard@google.com> | 2021-11-23 15:45:19 -0800 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2021-11-24 07:38:49 +0000 |
commit | 000806f373046ae9501791b92c79a8acf6844ff8 (patch) | |
tree | 3760d371fa29efa11e8f056c414c43ddbb6adc67 /source/row_neon64.cc | |
parent | a04e4f87fbf40405707b1d0ae9fcba8fc93f7856 (diff) | |
download | libyuv-000806f373046ae9501791b92c79a8acf6844ff8.tar.gz |
NV21ToYUV24 replace ST3 with ST1. ARGBToAR64 replace ST2 with ST1
On Samsung S8 Exynos M2
Was ST3 NV21ToYUV24_Opt (769 ms)
Now ST1 NV21ToYUV24_Opt (473 ms)
Was ST2 ARGBToAR64_Opt (1759 ms)
Now ST1 ARGBToAR64_Opt (987 ms)
Skylake Xeon, AVX2 version:
Was NV21ToYUV24_Opt (885 ms)
Now NV21ToYUV24_Opt (194 ms)
Bug: b/204562143, b/124413599
Change-Id: Icc9cb64d822cd11937789a4e04fbb773b3e33aa3
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3290664
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r-- | source/row_neon64.cc | 67 |
1 file changed, 64 insertions, 3 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc index fff27870..b781bda3 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -15,6 +15,9 @@ namespace libyuv { extern "C" { #endif +// Enable LIBYUV_USE_ST2 and LIBYUV_USE_ST3 for CPUs that prefer them. +// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions. + // This module is for GCC Neon armv8 64 bit. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) @@ -1683,6 +1686,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23"); } +#if LIBYUV_USE_ST2 void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { @@ -1702,6 +1706,28 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, : : "cc", "memory", "v0", "v1", "v2", "v3"); } +#else +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "zip1 v2.16b, v0.16b, v0.16b \n" + "zip2 v3.16b, v0.16b, v0.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "zip1 v4.16b, v1.16b, v1.16b \n" + "zip2 v5.16b, v1.16b, v1.16b \n" + "st1 {v2.16b, v3.16b, v4.16b, v5.16b}, [%1], #64 \n" // 8 AR64 + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); +} +#endif // LIBYUV_USE_ST2 static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; @@ -3669,6 +3695,7 @@ void GaussRow_F32_NEON(const float* src, float* dst, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); } +#if LIBYUV_USE_ST3 // Convert biplanar NV21 to packed YUV24 void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, @@ -3692,8 +3719,42 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, : : "cc", "memory", "v0", "v1", "v2"); } +#else +static const uvec8 kYUV24Shuffle[3] = + {{ 16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20 }, + { 21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27 }, + { 10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15 }}; -// AYUV is YVUA in memory. UV for NV12 is UV order in memory. +// Convert biplanar NV21 to packed YUV24 +// NV21 has VU in memory for chroma. 
+// YUV24 is VUY in memory +void NV21ToYUV24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "ld1 {v5.16b,v6.16b,v7.16b}, [%4]\n" // 3 shuffler constants + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values + "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values + "tbl v2.16b, {v0.16b,v1.16b}, v5.16b\n" // weave into YUV24 + "prfm pldl1keep, [%0, 448] \n" + "tbl v3.16b, {v0.16b,v1.16b}, v6.16b\n" + "prfm pldl1keep, [%1, 448] \n" + "tbl v4.16b, {v0.16b,v1.16b}, v7.16b\n" + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48\n" // store 16 YUV pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Shuffle[0]) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} +#endif // LIBYUV_USE_ST3 + +// AYUV is VUYA in memory. UV for NV12 is UV order in memory. void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, @@ -3708,8 +3769,8 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v2.8b, v1.8h, #2 \n" "subs %w3, %w3, #16 \n" // 16 processed per loop. @@ -3737,8 +3798,8 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. 
+ "prfm pldl1keep, [%1, 448] \n" "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v1.8b, v1.8h, #2 \n" "subs %w3, %w3, #16 \n" // 16 processed per loop. |