author     Frank Barchard <fbarchard@google.com>   2020-05-04 12:32:28 -0700
committer  Commit Bot <commit-bot@chromium.org>    2020-05-04 22:32:14 +0000
commit     7a61759f78e37113221cfe7c40c522aa505280af (patch)
tree       6890e589788e8ec6c743544e9a3c3ccc5377fc8b
parent     d9681c53b3af633ab3c64655fcb9625e364b8f9c (diff)
download   libyuv-7a61759f78e37113221cfe7c40c522aa505280af.tar.gz
Add NV12Mirror and MirrorUVPlane functions

Also add an AVX2 version of HalfMergeUV (HalfMergeUVRow_AVX2).

Skylake Xeon performance for 1280x720:
NV12Mirror_Any (109 ms)
NV12Mirror_Unaligned (113 ms)
NV12Mirror_Invert (107 ms)
NV12Mirror_Opt (108 ms)
NV12Mirror_NullY (19 ms)
Slightly faster than the comparable I420Mirror:
I420Mirror_Any (113 ms)
I420Mirror_Unaligned (110 ms)
I420Mirror_Invert (109 ms)
I420Mirror_Opt (110 ms)
BUG=libyuv:840, libyuv:858
Change-Id: I686b1b778383bfa10ecd1655e986bdc99e76d132
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2176066
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
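For orientation before the diff, here is a minimal usage sketch of the new NV12Mirror entry point. The signature is taken from include/libyuv/planar_functions.h below; the frame size and buffer setup are hypothetical. Per the docs/formats.md change in this commit, the destination Y pointer may be NULL to mirror only the UV plane, which is the NV12Mirror_NullY case measured above.

    #include <cstdint>
    #include <vector>
    #include "libyuv/planar_functions.h"

    // Mirror a 1280x720 NV12 frame horizontally. NV12 stores a full-size Y
    // plane plus a half-height plane of interleaved UV pairs, so the UV
    // buffer is width * height / 2 bytes with a stride of width bytes.
    void MirrorNV12Frame() {
      const int width = 1280;
      const int height = 720;
      std::vector<uint8_t> src_y(width * height);
      std::vector<uint8_t> src_uv(width * height / 2);
      std::vector<uint8_t> dst_y(width * height);
      std::vector<uint8_t> dst_uv(width * height / 2);
      // Pass nullptr for dst_y to mirror only the UV plane.
      libyuv::NV12Mirror(src_y.data(), width, src_uv.data(), width,
                         dst_y.data(), width, dst_uv.data(), width,
                         width, height);
    }

As with the other libyuv mirror functions, a negative height flips the image vertically as well.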
 README.chromium                   |   2
 docs/formats.md                   |   1
 docs/getting_started.md           |   2
 include/libyuv/planar_functions.h |  25
 include/libyuv/row.h              |  20
 include/libyuv/version.h          |   2
 source/convert.cc                 |   4
 source/planar_functions.cc        |  96
 source/rotate.cc                  |   2
 source/rotate_argb.cc             |   2
 source/row_any.cc                 |  11
 source/row_common.cc              |  11
 source/row_gcc.cc                 | 114
 source/row_neon.cc                |  20
 source/row_neon64.cc              | 125
 unit_test/convert_test.cc         |   1
 unit_test/planar_test.cc          |  85
17 files changed, 434 insertions(+), 89 deletions(-)
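The core row operation is simple: a UV row is reversed pair-by-pair, so each U/V byte pair keeps its internal order while the pairs swap ends of the row. A scalar sketch equivalent to the MirrorUVRow_C reference added in source/row_common.cc below (the function name here is illustrative):

    #include <stdint.h>

    // Reverse a row of interleaved UV pairs. width is in UV pixels (pairs),
    // so a row of width pairs occupies width * 2 bytes.
    void MirrorUVRowScalar(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
      src_uv += (width - 1) * 2;  // Start at the last UV pair.
      for (int x = 0; x < width; ++x) {
        dst_uv[0] = src_uv[0];  // U stays first within its pair.
        dst_uv[1] = src_uv[1];  // V stays second.
        src_uv -= 2;
        dst_uv += 2;
      }
    }

The SIMD versions in the diff do the same with byte-shuffle tables (pshufb/vpshufb on x86, tbl on AArch64) whose entries reverse 16 bytes in pair order, e.g. {14, 15, 12, 13, ...}.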
diff --git a/README.chromium b/README.chromium
index 994c4fcb..51381f24 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1749
+Version: 1751
 License: BSD
 License File: LICENSE
diff --git a/docs/formats.md b/docs/formats.md
index 260dd731..771fb460 100644
--- a/docs/formats.md
+++ b/docs/formats.md
@@ -166,3 +166,4 @@ The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half
 height chroma channel, and therefore is a 420 subsampling.
 NV16 is 16 bits per pixel, with half width and full height. aka 422.
 NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
+Most NV12 functions allow the destination Y pointer to be NULL.
diff --git a/docs/getting_started.md b/docs/getting_started.md
index 4426b606..3e339712 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -190,7 +190,7 @@ mips
     make V=1 -f linux.mk
     make V=1 -f linux.mk clean
-    make V=1 -f linux.mk CXX=clang++
+    make V=1 -f linux.mk CXX=clang++ CC=clang

 ## Building the library with cmake
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 9caef1b5..b11776a0 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -315,6 +315,22 @@ int I400Mirror(const uint8_t* src_y,
                int height);

 // Alias
+#define NV12ToNV12Mirror NV12Mirror
+
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
+// Alias
 #define ARGBToARGBMirror ARGBMirror

 // ARGB mirror.
@@ -347,6 +363,15 @@ void MirrorPlane(const uint8_t* src_y,
                  int width,
                  int height);

+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+                   int src_stride_uv,
+                   uint8_t* dst_uv,
+                   int dst_stride_uv,
+                   int width,
+                   int height);
+
 // Convert NV12 to RGB565.
 LIBYUV_API
 int NV12ToRGB565(const uint8_t* src_y,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 02abff6f..c4c3dd44 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -274,16 +274,18 @@ extern "C" {
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_CONVERT8TO16ROW_SSE2
 #define HAS_HALFMERGEUVROW_SSSE3
-// I210 is for H010. 2 = 422. I for 601 vs H for 709.
 #define HAS_I210TOAR30ROW_SSSE3
 #define HAS_I210TOARGBROW_SSSE3
 #define HAS_I422TOAR30ROW_SSSE3
 #define HAS_MERGERGBROW_SSSE3
+#define HAS_MIRRORUVROW_AVX2
+#define HAS_MIRRORUVROW_SSSE3
 #define HAS_RAWTORGBAROW_SSSE3
 #define HAS_RGB24MIRRORROW_SSSE3
 #define HAS_RGBATOYJROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
 #define HAS_SWAPUVROW_SSSE3
+
 #endif

 // The following are available for AVX2 gcc/clang x86 platforms:
@@ -299,6 +301,7 @@ extern "C" {
 #define HAS_ARGBTORGB24ROW_AVX2
 #define HAS_CONVERT16TO8ROW_AVX2
 #define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_HALFMERGEUVROW_AVX2
 #define HAS_I210TOAR30ROW_AVX2
 #define HAS_I210TOARGBROW_AVX2
 #define HAS_I422TOAR30ROW_AVX2
@@ -368,6 +371,7 @@ extern "C" {
 #define HAS_J400TOARGBROW_NEON
 #define HAS_MERGEUVROW_NEON
 #define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
 #define HAS_MIRRORSPLITUVROW_NEON
 #define HAS_NV12TOARGBROW_NEON
 #define HAS_NV12TORGB24ROW_NEON
@@ -1574,6 +1578,13 @@ void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
                             uint8_t* dst_u,
@@ -1735,6 +1746,13 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                           uint8_t* dst_uv,
                           int width);

+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width);
+
 void SplitRGBRow_C(const uint8_t* src_rgb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index d8d586dc..b9ab8296 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1749
+#define LIBYUV_VERSION 1751

 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert.cc b/source/convert.cc
index 2d12bb9c..e3e282eb 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -775,7 +775,7 @@ int YUY2ToI420(const uint8_t* src_yuy2,
     }
   }
 #endif
-#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     YUY2ToYRow = YUY2ToYRow_Any_MSA;
     YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
@@ -1476,7 +1476,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
     }
   }
 #endif
-#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
+#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
     RGB24ToYRow = RGB24ToYRow_Any_MSA;
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 02171ff6..7980dcfa 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1049,6 +1049,56 @@ void MirrorPlane(const uint8_t* src_y,
   }
 }

+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+                   int src_stride_uv,
+                   uint8_t* dst_uv,
+                   int dst_stride_uv,
+                   int width,
+                   int height) {
+  int y;
+  void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
+      MirrorUVRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uv = src_uv + (height - 1) * src_stride_uv;
+    src_stride_uv = -src_stride_uv;
+  }
+#if defined(HAS_MIRRORUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorUVRow = MirrorUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorUVRow = MirrorUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorUVRow = MirrorUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      MirrorUVRow = MirrorUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorUVRow = MirrorUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorUVRow = MirrorUVRow_AVX2;
+    }
+  }
+#endif
+
+  // MirrorUV plane
+  for (y = 0; y < height; ++y) {
+    MirrorUVRow(src_uv, dst_uv, width);
+    src_uv += src_stride_uv;
+    dst_uv += dst_stride_uv;
+  }
+}
+
 // Mirror I400 with optional flipping
 LIBYUV_API
 int I400Mirror(const uint8_t* src_y,
@@ -1089,7 +1139,7 @@ int I420Mirror(const uint8_t* src_y,
                int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+  if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
       height == 0) {
     return -1;
   }
@@ -1113,6 +1163,42 @@ int I420Mirror(const uint8_t* src_y,
   return 0;
 }

+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_uv || !dst_uv || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+
+  if (dst_y) {
+    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
+                halfheight);
+  return 0;
+}
+
 // ARGB mirror.
 LIBYUV_API
 int ARGBMirror(const uint8_t* src_argb,
@@ -1136,7 +1222,7 @@ int ARGBMirror(const uint8_t* src_argb,
 #if defined(HAS_ARGBMIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBMirrorRow = ARGBMirrorRow_NEON;
     }
   }
@@ -4136,7 +4222,11 @@ void HalfMergeUVPlane(const uint8_t* src_u,
     HalfMergeUVRow = HalfMergeUVRow_SSSE3;
   }
 #endif
-
+#if defined(HAS_HALFMERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+    HalfMergeUVRow = HalfMergeUVRow_AVX2;
+  }
+#endif
   for (y = 0; y < height - 1; y += 2) {
     // Merge a row of U and V into a row of UV.
     HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
diff --git a/source/rotate.cc b/source/rotate.cc
index 0954882d..32904e47 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -347,7 +347,7 @@ void RotateUV180(const uint8_t* src,
   void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
                            int width) = MirrorSplitUVRow_C;
 #if defined(HAS_MIRRORSPLITUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
     MirrorSplitUVRow = MirrorSplitUVRow_NEON;
   }
 #endif
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index 5ef9266f..ae653886 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -126,7 +126,7 @@ static int ARGBRotate180(const uint8_t* src_argb,
 #if defined(HAS_ARGBMIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBMirrorRow = ARGBMirrorRow_NEON;
     }
   }
diff --git a/source/row_any.cc b/source/row_any.cc
index 2e9538bd..36c721bb 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1182,6 +1182,15 @@ ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
 #ifdef HAS_MIRRORROW_MMI
 ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
 #endif
+#ifdef HAS_MIRRORUVROW_AVX2
+ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
+#endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_NEON
+ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
+#endif
 #ifdef HAS_ARGBMIRRORROW_AVX2
 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
 #endif
@@ -1189,7 +1198,7 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
 #ifdef HAS_ARGBMIRRORROW_SSE2
 ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
 #endif
 #ifdef HAS_ARGBMIRRORROW_NEON
-ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 15)
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
 #endif
 #ifdef HAS_ARGBMIRRORROW_MSA
 ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
diff --git a/source/row_common.cc b/source/row_common.cc
index 2d0f27d4..5e801daf 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2162,6 +2162,17 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
   }
 }

+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  int x;
+  src_uv += (width - 1) << 1;
+  for (x = 0; x < width; ++x) {
+    dst_uv[0] = src_uv[0];
+    dst_uv[1] = src_uv[1];
+    src_uv -= 2;
+    dst_uv += 2;
+  }
+}
+
 void MirrorSplitUVRow_C(const uint8_t* src_uv,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index e2088561..c4a9579d 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -3229,10 +3229,62 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_AVX2

+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+                                       6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};
+
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile(
+
+      "movdqa      %3,%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      -0x10(%0,%2,2),%%xmm0         \n"
+      "pshufb      %%xmm5,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_uv),          // %0
+        "+r"(dst_uv),          // %1
+        "+r"(temp_width)       // %2
+      : "m"(kShuffleMirrorUV)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
+}
+#endif  // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_MIRRORUVROW_AVX2
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile(
+
+      "vbroadcastf128 %3,%%ymm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     -0x20(%0,%2,2),%%ymm0         \n"
+      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uv),          // %0
+        "+r"(dst_uv),          // %1
+        "+r"(temp_width)       // %2
+      : "m"(kShuffleMirrorUV)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
+}
+#endif  // HAS_MIRRORUVROW_AVX2
+
 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
-                                       15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                            15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
@@ -3253,11 +3305,11 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src,
       "lea         0x8(%1),%1                    \n"
      "sub         $8,%3                         \n"
       "jg          1b                            \n"
-      : "+r"(src),             // %0
-        "+r"(dst_u),           // %1
-        "+r"(dst_v),           // %2
-        "+r"(temp_width)       // %3
-      : "m"(kShuffleMirrorUV)  // %4
+      : "+r"(src),                  // %0
+        "+r"(dst_u),                // %1
+        "+r"(dst_v),                // %2
+        "+r"(temp_width)            // %3
+      : "m"(kShuffleMirrorSplitUV)  // %4
       : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_MIRRORSPLITUVROW_SSSE3
@@ -7052,6 +7104,54 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }

+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
+      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"  // load 32 U values
+      "vmovdqu     (%1),%%ymm1                   \n"  // load 32 V values
+      "vmovdqu     0(%0,%4,1),%%ymm2             \n"  // 32 from next row
+      "vmovdqu     0(%1,%5,1),%%ymm3             \n"
+      "lea         0x20(%0),%0                   \n"
+      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // half size
+      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
+      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
+      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
+      "lea         0x20(%1),%1                   \n"
+      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
+      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
+      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
+      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
+      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
+      "vmovdqu     %%ymm0,(%2)                   \n"  // store 16 UV pixels
+      "lea         0x20(%2),%2                   \n"
+      "sub         $0x20,%3                      \n"  // 32 src pixels per loop
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_u),                    // %0
+        "+r"(src_v),                    // %1
+        "+r"(dst_uv),                   // %2
+        "+r"(width)                     // %3
+      : "r"((intptr_t)(src_stride_u)),  // %4
"r"((intptr_t)(src_stride_v)) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/row_neon.cc b/source/row_neon.cc index aecdf329..12591d33 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -701,6 +701,26 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { : "cc", "memory", "q0", "q1", "q2"); } +void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + asm volatile( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %2, lsl #1 \n" + "sub %0, #16 \n" + + "1: \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst2.8 {d0, d1}, [%1]! \n" // dst += 16 + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r12", "q0"); +} + void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index f9e0fd36..d26d7abf 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -747,67 +747,99 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( // Start at end of source row. - "ld1 {v3.16b}, [%4] \n" // shuffler + "ld1 {v3.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw \n" "sub %0, %0, #32 \n" "1: \n" - "ld1 {v1.16b,v2.16b}, [%0], %3 \n" // src -= 32 + "ldr q2, [%0, 16] \n" + "ldr q1, [%0], -32 \n" // src -= 32 "subs %w2, %w2, #32 \n" // 32 pixels per loop. - "tbl v1.16b, {v1.16b}, v3.16b \n" "tbl v0.16b, {v2.16b}, v3.16b \n" + "tbl v1.16b, {v1.16b}, v3.16b \n" "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : "r"((ptrdiff_t)-32), // %3 - "r"(&kShuffleMirror) // %4 + : "r"(&kShuffleMirror) // %3 : "cc", "memory", "v0", "v1", "v2", "v3"); } +// Shuffle table for reversing the UV. +static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, + 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; + +void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + asm volatile( + // Start at end of source row. + "ld1 {v4.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw #1 \n" + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"(&kShuffleMirrorUV) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( // Start at end of source row. + "ld1 {v4.16b}, [%4] \n" // shuffler "add %0, %0, %w3, sxtw #1 \n" - "sub %0, %0, #16 \n" - "1: \n" - "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 - "subs %w3, %w3, #8 \n" // 8 pixels per loop. 
- "rev64 v0.8b, v0.8b \n" - "rev64 v1.8b, v1.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // dst += 8 - "st1 {v1.8b}, [%2], #8 \n" - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)-16) // %4 - : "cc", "memory", "v0", "v1"); + "sub %0, %0, #32 \n" + "1: \n" + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w3, %w3, #16 \n" // 16 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "uzp1 v0.16b, v2.16b, v3.16b \n" // U + "uzp2 v1.16b, v2.16b, v3.16b \n" // V + "st1 {v0.16b}, [%1], #16 \n" // dst += 16 + "st1 {v1.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(&kShuffleMirrorUV) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } +// Shuffle table for reversing the ARGB. +static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, + 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u}; + void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "ld1 {v4.16b}, [%4] \n" // shuffler - "add %0, %0, %w2, sxtw #2 \n" // Start at end of row. - "sub %0, %0, #64 \n" + // Start at end of source row. + "ld1 {v4.16b}, [%3] \n" // shuffler + "add %0, %0, %w2, sxtw #2 \n" + "sub %0, %0, #32 \n" "1: \n" - "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], %3\n" // src -= 64 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. - "tbl v0.16b, {v0.16b}, v4.16b \n" - "tbl v1.16b, {v1.16b}, v4.16b \n" - "tbl v2.16b, {v2.16b}, v4.16b \n" - "tbl v3.16b, {v3.16b}, v4.16b \n" - "st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n" // dst += 64 - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-64), // %3 - "r"(&kShuffleMirror) // %4 + "ldr q1, [%0, 16] \n" + "ldr q0, [%0], -32 \n" // src -= 32 + "subs %w2, %w2, #8 \n" // 8 pixels per loop. + "tbl v2.16b, {v1.16b}, v4.16b \n" + "tbl v3.16b, {v0.16b}, v4.16b \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleMirrorARGB) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } @@ -3249,20 +3281,27 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { : "cc", "memory", "v0", "v1", "v2", "v3"); } +// Shuffle table for swapping UV bytes. +static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, + 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; + // Convert UV plane of NV12 to VU of NV21. 
 void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
   asm volatile(
+      "ld1         {v2.16b}, [%3]                \n"  // shuffler
       "1:                                        \n"
-      "ld2         {v0.16b, v1.16b}, [%0], #32   \n"  // load 16 UV values
-      "orr         v2.16b, v0.16b, v0.16b        \n"  // move U after V
+      "ld1         {v0.16b}, [%0], 16            \n"  // load 16 UV values
+      "ld1         {v1.16b}, [%0], 16            \n"
       "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop
-      "st2         {v1.16b, v2.16b}, [%1], #32   \n"  // store 16 VU pixels
+      "tbl         v0.16b, {v0.16b}, v2.16b      \n"
+      "tbl         v1.16b, {v1.16b}, v2.16b      \n"
       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "stp         q0, q1, [%1], 32              \n"  // store 16 VU pixels
       "b.gt        1b                            \n"
-      : "+r"(src_uv),  // %0
-        "+r"(dst_vu),  // %1
-        "+r"(width)    // %2
-      :
+      : "+r"(src_uv),         // %0
+        "+r"(dst_vu),         // %1
+        "+r"(width)           // %2
+      : "r"(&kShuffleSwapUV)  // %3
       : "cc", "memory", "v0", "v1", "v2");
 }
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 6765ebfa..323f8d22 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -497,6 +497,7 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
                       SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)

 TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
+TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)

 #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
                          FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 31a42535..50bca4e4 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -782,44 +782,75 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
   }
 }

-TEST_F(LibYUVPlanarTest, TestARGBMirror) {
-  SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
-  SIMD_ALIGNED(uint8_t dst_pixels[1280][4]);
+TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) {
+  align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+  align_buffer_page_end(dst_pixels_opt,
+                        benchmark_width_ * benchmark_height_ * 4);
+  align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4);

-  for (int i = 0; i < 1280; ++i) {
-    orig_pixels[i][0] = i;
-    orig_pixels[i][1] = i / 2;
-    orig_pixels[i][2] = i / 3;
-    orig_pixels[i][3] = i / 4;
-  }
-  ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+  MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+             benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);

-  for (int i = 0; i < 1280; ++i) {
-    EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
-    EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
-    EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
-    EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+               benchmark_width_ * 4, benchmark_width_, benchmark_height_);
   }
-  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-    ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(dst_pixels_c);
 }

-TEST_F(LibYUVPlanarTest, TestMirrorPlane) {
-  SIMD_ALIGNED(uint8_t orig_pixels[1280]);
-  SIMD_ALIGNED(uint8_t dst_pixels[1280]);
+TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) {
+  align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_);

-  for (int i = 0; i < 1280; ++i) {
-    orig_pixels[i] = i;
+  MemRandomize(src_pixels, benchmark_width_ * benchmark_height_);
+  MaskCpuFlags(disable_cpu_flags_);
+  MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_,
+              benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_,
+                benchmark_width_, benchmark_height_);
   }
-  MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);

-  for (int i = 0; i < 1280; ++i) {
-    EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i]);
+  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
-  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-    MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) {
+  align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+  align_buffer_page_end(dst_pixels_opt,
+                        benchmark_width_ * benchmark_height_ * 2);
+  align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2);
+
+  MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+  MaskCpuFlags(disable_cpu_flags_);
+  MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
+                benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
+                  benchmark_width_ * 2, benchmark_width_, benchmark_height_);
   }
+  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(dst_pixels_c);
 }

 TEST_F(LibYUVPlanarTest, TestShade) {
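For reference, the operation the new HalfMergeUVRow_AVX2 accelerates: HalfMergeUVPlane box-filters full-resolution U and V planes down by 2x2 and interleaves the result into a single UV plane (e.g. 4:4:4 chroma down to NV12 chroma). A scalar sketch of one output row, using the same round-to-nearest averaging the vpavgw sequence above produces (the helper name is illustrative; the contract matches the existing HalfMergeUVRow_C):

    #include <stdint.h>

    // Average each 2x2 block of planar U and V and write interleaved UV.
    // width is in source pixels; emits width / 2 UV pairs from two src rows.
    void HalfMergeUVRowScalar(const uint8_t* src_u, int src_stride_u,
                              const uint8_t* src_v, int src_stride_v,
                              uint8_t* dst_uv, int width) {
      for (int x = 0; x < width - 1; x += 2) {
        // Adding 2 rounds the 4-pixel sum to nearest when divided by 4.
        dst_uv[0] = (src_u[x] + src_u[x + 1] + src_u[src_stride_u + x] +
                     src_u[src_stride_u + x + 1] + 2) >> 2;
        dst_uv[1] = (src_v[x] + src_v[x + 1] + src_v[src_stride_v + x] +
                     src_v[src_stride_v + x + 1] + 2) >> 2;
        dst_uv += 2;
      }
    }

The AVX2 row reaches the same result with vpmaddubsw against a vector of ones to sum horizontal pairs, vpaddw to add the two rows, then a shift plus vpavgw against zero to divide by four with rounding.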