aboutsummaryrefslogtreecommitdiff
path: root/source/row_neon64.cc
diff options
context:
space:
mode:
author: Frank Barchard <fbarchard@google.com> 2022-09-30 15:12:37 -0700
committer: libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> 2022-09-30 22:41:21 +0000
commit: 00950840d1c9bcbb3eb6ebc5aac5793e71166c8b (patch)
tree: a261be18062391f039e4e230ce5714f0059ae845 /source/row_neon64.cc
parent: 9ba40a8f03673b79d3236e79707723fdf99f76b6 (diff)
download: libyuv-00950840d1c9bcbb3eb6ebc5aac5793e71166c8b.tar.gz
YUY2ToNV12 using YUY2ToY and YUY2ToNVUV
- Optimized YUY2ToNV12 that reduces it from 3 steps to 2 steps
- Was SplitUV, memcpy Y, InterpolateUV
- Now YUY2ToY, YUY2ToNVUV
- rollback LIBYUV_UNLIMITED_DATA

3840x2160 1000 iterations:

Pixel 2 Cortex A73
Was YUY2ToNV12_Opt (6515 ms)
Now YUY2ToNV12_Opt (3350 ms)

AB7 Mediatek P35 Cortex A53
Was YUY2ToNV12_Opt (6435 ms)
Now YUY2ToNV12_Opt (3301 ms)

Skylake AVX2 x64
Was YUY2ToNV12_Opt (1872 ms)
Now YUY2ToNV12_Opt (1657 ms)

SSE2 x64
Was YUY2ToNV12_Opt (2008 ms)
Now YUY2ToNV12_Opt (1691 ms)

Windows Skylake AVX2 32 bit x86
Was YUY2ToNV12_Opt (2161 ms)
Now YUY2ToNV12_Opt (1628 ms)

Bug: libyuv:943
Change-Id: I6c2ba2ae765413426baf770b837de114f808f6d0
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3929843
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r-- source/row_neon64.cc 23
1 files changed, 23 insertions, 0 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 3cbd9b79..880a5f06 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1808,6 +1808,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
);
}
+void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, // first of the two source rows that are averaged
+ int stride_yuy2, // added to src_yuy2 to locate the second source row
+ uint8_t* dst_uv, // destination; 16 bytes (8 UV pairs) are stored per loop pass
+ int width) { // pixel count; the loop consumes 16 pixels per pass
+ const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; // second source row
+ asm volatile(
+ "1: \n" // loop head; no check precedes it, so the body runs at least once
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels (32 bytes): even bytes -> v0, odd bytes -> v1
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "ld2 {v2.16b,v3.16b}, [%1], #32 \n" // load next row: even bytes -> v2, odd bytes -> v3
+ "urhadd v4.16b, v1.16b, v3.16b \n" // average rows of UV (unsigned, rounded: (a + b + 1) >> 1)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the first-row pointer; the second row is not prefetched
+ "st1 {v4.16b}, [%2], #16 \n" // store 8 UV.
+ "b.gt 1b \n" // loop while the count left by subs is > 0
+ : "+r"(src_yuy2), // %0
+ "+r"(src_yuy2b), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 NOTE(review): a width that is not a multiple of 16 still gets a full 16-pixel final pass - presumably callers handle remainders; confirm
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,