author     Frank Barchard <fbarchard@google.com>  2020-05-04 12:32:28 -0700
committer  Commit Bot <commit-bot@chromium.org>   2020-05-04 22:32:14 +0000
commit     7a61759f78e37113221cfe7c40c522aa505280af (patch)
tree       6890e589788e8ec6c743544e9a3c3ccc5377fc8b
parent     d9681c53b3af633ab3c64655fcb9625e364b8f9c (diff)
download   libyuv-7a61759f78e37113221cfe7c40c522aa505280af.tar.gz
NV12Mirror and MirrorUVPlane functions added
HalfMergeUV AVX2 version

Skylake Xeon performance for 1280x720:
NV12Mirror_Any (109 ms)
NV12Mirror_Unaligned (113 ms)
NV12Mirror_Invert (107 ms)
NV12Mirror_Opt (108 ms)
NV12Mirror_NullY (19 ms)

Slightly faster than comparable I420Mirror:
I420Mirror_Any (113 ms)
I420Mirror_Unaligned (110 ms)
I420Mirror_Invert (109 ms)
I420Mirror_Opt (110 ms)

BUG=libyuv:840, libyuv:858
Change-Id: I686b1b778383bfa10ecd1655e986bdc99e76d132
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2176066
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
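A minimal usage sketch of the new entry point (not part of this change;
buffer names and dimensions are assumptions, and strides assume tightly
packed planes):

  #include "libyuv/planar_functions.h"

  // Horizontally mirror a 1280x720 NV12 frame. For packed NV12 the Y and
  // interleaved UV planes share the same byte stride (== width).
  int MirrorNV12Frame(const uint8_t* src_y, const uint8_t* src_uv,
                      uint8_t* dst_y, uint8_t* dst_uv) {
    const int width = 1280;
    const int height = 720;
    return libyuv::NV12Mirror(src_y, width, src_uv, width,
                              dst_y, width, dst_uv, width,
                              width, height);  // 0 on success, -1 on bad args
  }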
-rw-r--r--  README.chromium                      2
-rw-r--r--  docs/formats.md                      1
-rw-r--r--  docs/getting_started.md              2
-rw-r--r--  include/libyuv/planar_functions.h   25
-rw-r--r--  include/libyuv/row.h                20
-rw-r--r--  include/libyuv/version.h             2
-rw-r--r--  source/convert.cc                    4
-rw-r--r--  source/planar_functions.cc          96
-rw-r--r--  source/rotate.cc                     2
-rw-r--r--  source/rotate_argb.cc                2
-rw-r--r--  source/row_any.cc                   11
-rw-r--r--  source/row_common.cc                11
-rw-r--r--  source/row_gcc.cc                  114
-rw-r--r--  source/row_neon.cc                  20
-rw-r--r--  source/row_neon64.cc               125
-rw-r--r--  unit_test/convert_test.cc            1
-rw-r--r--  unit_test/planar_test.cc            85
17 files changed, 434 insertions, 89 deletions
diff --git a/README.chromium b/README.chromium
index 994c4fcb..51381f24 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1749
+Version: 1751
License: BSD
License File: LICENSE
diff --git a/docs/formats.md b/docs/formats.md
index 260dd731..771fb460 100644
--- a/docs/formats.md
+++ b/docs/formats.md
@@ -166,3 +166,4 @@ The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half
height chroma channel, and therefore is a 420 subsampling.
NV16 is 16 bits per pixel, with half width and full height. aka 422.
NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
+Most NV12 functions allow the destination Y pointer to be NULL.
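For example (a sketch, not part of this diff), passing NULL as the
destination Y pointer to the new NV12Mirror mirrors only the chroma plane,
which is what the NV12Mirror_NullY benchmark above measures:

  // dst_y == NULL: the Y plane is skipped and its stride is ignored.
  libyuv::NV12Mirror(src_y, y_stride, src_uv, uv_stride,
                     /*dst_y=*/NULL, 0, dst_uv, uv_stride, width, height);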
diff --git a/docs/getting_started.md b/docs/getting_started.md
index 4426b606..3e339712 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -190,7 +190,7 @@ mips
make V=1 -f linux.mk
make V=1 -f linux.mk clean
- make V=1 -f linux.mk CXX=clang++
+ make V=1 -f linux.mk CXX=clang++ CC=clang
## Building the library with cmake
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 9caef1b5..b11776a0 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -315,6 +315,22 @@ int I400Mirror(const uint8_t* src_y,
int height);
// Alias
+#define NV12ToNV12Mirror NV12Mirror
+
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Alias
#define ARGBToARGBMirror ARGBMirror
// ARGB mirror.
@@ -347,6 +363,15 @@ void MirrorPlane(const uint8_t* src_y,
int width,
int height);
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
// Convert NV12 to RGB565.
LIBYUV_API
int NV12ToRGB565(const uint8_t* src_y,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 02abff6f..c4c3dd44 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -274,16 +274,18 @@ extern "C" {
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
#define HAS_HALFMERGEUVROW_SSSE3
-// I210 is for H010. 2 = 422. I for 601 vs H for 709.
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3
#define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3
+#define HAS_MIRRORUVROW_AVX2
+#define HAS_MIRRORUVROW_SSSE3
#define HAS_RAWTORGBAROW_SSSE3
#define HAS_RGB24MIRRORROW_SSSE3
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
+
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
@@ -299,6 +301,7 @@ extern "C" {
#define HAS_ARGBTORGB24ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_HALFMERGEUVROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I422TOAR30ROW_AVX2
@@ -368,6 +371,7 @@ extern "C" {
#define HAS_J400TOARGBROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
#define HAS_MIRRORSPLITUVROW_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB24ROW_NEON
@@ -1574,6 +1578,13 @@ void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
uint8_t* dst_u,
@@ -1735,6 +1746,13 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
uint8_t* dst_uv,
int width);
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index d8d586dc..b9ab8296 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1749
+#define LIBYUV_VERSION 1751
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert.cc b/source/convert.cc
index 2d12bb9c..e3e282eb 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -775,7 +775,7 @@ int YUY2ToI420(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
@@ -1476,7 +1476,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
}
}
#endif
-#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
+#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
RGB24ToYRow = RGB24ToYRow_Any_MSA;
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 02171ff6..7980dcfa 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1049,6 +1049,56 @@ void MirrorPlane(const uint8_t* src_y,
}
}
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
+ MirrorUVRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+#if defined(HAS_MIRRORUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorUVRow = MirrorUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorUVRow = MirrorUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorUVRow = MirrorUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorUVRow = MirrorUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorUVRow = MirrorUVRow_AVX2;
+ }
+ }
+#endif
+
+ // MirrorUV plane
+ for (y = 0; y < height; ++y) {
+ MirrorUVRow(src_uv, dst_uv, width);
+ src_uv += src_stride_uv;
+ dst_uv += dst_stride_uv;
+ }
+}
+
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,
@@ -1089,7 +1139,7 @@ int I420Mirror(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
@@ -1113,6 +1163,42 @@ int I420Mirror(const uint8_t* src_y,
return 0;
}
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_uv || !dst_uv || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ if (dst_y) {
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
+ halfheight);
+ return 0;
+}
+
// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8_t* src_argb,
@@ -1136,7 +1222,7 @@ int ARGBMirror(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -4136,7 +4222,11 @@ void HalfMergeUVPlane(const uint8_t* src_u,
HalfMergeUVRow = HalfMergeUVRow_SSSE3;
}
#endif
-
+#if defined(HAS_HALFMERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+ HalfMergeUVRow = HalfMergeUVRow_AVX2;
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
// Merge a row of U and V into a row of UV.
HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
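The hunk above also gives MirrorUVPlane a public entry point with its own
SIMD dispatch; a standalone usage sketch (buffer setup assumed, not part of
the diff):

  #include "libyuv/planar_functions.h"

  // Mirror the chroma plane of a 640x360 NV12 image. Width and height are
  // in UV pixels (half the luma dimensions for 4:2:0); each UV pixel is
  // 2 bytes, so a packed plane has stride uv_width * 2.
  void MirrorChroma(const uint8_t* src_uv, uint8_t* dst_uv) {
    const int uv_width = 320;   // (640 + 1) / 2
    const int uv_height = 180;  // (360 + 1) / 2
    libyuv::MirrorUVPlane(src_uv, uv_width * 2, dst_uv, uv_width * 2,
                          uv_width, uv_height);
  }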
diff --git a/source/rotate.cc b/source/rotate.cc
index 0954882d..32904e47 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -347,7 +347,7 @@ void RotateUV180(const uint8_t* src,
void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
int width) = MirrorSplitUVRow_C;
#if defined(HAS_MIRRORSPLITUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
MirrorSplitUVRow = MirrorSplitUVRow_NEON;
}
#endif
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index 5ef9266f..ae653886 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -126,7 +126,7 @@ static int ARGBRotate180(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
diff --git a/source/row_any.cc b/source/row_any.cc
index 2e9538bd..36c721bb 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1182,6 +1182,15 @@ ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
#ifdef HAS_MIRRORROW_MMI
ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
#endif
+#ifdef HAS_MIRRORUVROW_AVX2
+ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
+#endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_NEON
+ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
+#endif
#ifdef HAS_ARGBMIRRORROW_AVX2
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
#endif
@@ -1189,7 +1198,7 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
#endif
#ifdef HAS_ARGBMIRRORROW_NEON
-ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 15)
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
#endif
#ifdef HAS_ARGBMIRRORROW_MSA
ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
diff --git a/source/row_common.cc b/source/row_common.cc
index 2d0f27d4..5e801daf 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2162,6 +2162,17 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ src_uv += (width - 1) << 1;
+ for (x = 0; x < width; ++x) {
+ dst_uv[0] = src_uv[0];
+ dst_uv[1] = src_uv[1];
+ src_uv -= 2;
+ dst_uv += 2;
+ }
+}
+
void MirrorSplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
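To illustrate the semantics of the new reference row (a sketch, not part of
the commit): width counts UV pairs, pairs are reversed as whole units, and
the bytes within each pair stay in U-then-V order.

  #include <assert.h>
  #include <stdint.h>
  #include <string.h>

  // Local copy of MirrorUVRow_C above, for illustration only.
  static void MirrorUVRow(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
    src_uv += (width - 1) << 1;
    for (int x = 0; x < width; ++x) {
      dst_uv[0] = src_uv[0];
      dst_uv[1] = src_uv[1];
      src_uv -= 2;
      dst_uv += 2;
    }
  }

  int main() {
    const uint8_t src[6] = {1, 2, 3, 4, 5, 6};  // 3 UV pairs
    uint8_t dst[6];
    MirrorUVRow(src, dst, 3);
    const uint8_t expected[6] = {5, 6, 3, 4, 1, 2};  // pairs reversed
    assert(memcmp(dst, expected, sizeof(dst)) == 0);
    return 0;
  }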
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index e2088561..c4a9579d 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -3229,10 +3229,62 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
}
#endif // HAS_MIRRORROW_AVX2
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "movdqa %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu -0x10(%0,%2,2),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_MIRRORUVROW_AVX2
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_AVX2
+
#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
- 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -3253,11 +3305,11 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src,
"lea 0x8(%1),%1 \n"
"sub $8,%3 \n"
"jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(temp_width) // %3
- : "m"(kShuffleMirrorUV) // %4
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorSplitUV) // %4
: "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_MIRRORSPLITUVROW_SSSE3
@@ -7052,6 +7104,54 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "1: \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // load 32 U values
+ "vmovdqu (%1),%%ymm1 \n" // load 32 V values
+ "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
+ "vmovdqu 0(%1,%5,1),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%3 \n" // 32 src pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
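For readers of the AVX2 row above: vpmaddubsw against a vector of byte ones
sums horizontal pixel pairs, vpaddw adds the pair sums from the row below,
and the vpsrlw/vpavgw-with-zero combination rounds the 4-pixel sum to
(sum + 2) >> 2. A scalar sketch of the same computation (modeled on the
existing HalfMergeUVRow_C; the odd-width tail is omitted):

  #include <stdint.h>

  // Each output U (and V) byte is the rounded average of a 2x2 block taken
  // from two source rows; U and V land interleaved in dst_uv. As in the
  // SIMD rows, width is in source pixels.
  static void HalfMergeUVRow_Sketch(const uint8_t* src_u, int src_stride_u,
                                    const uint8_t* src_v, int src_stride_v,
                                    uint8_t* dst_uv, int width) {
    for (int x = 0; x < width - 1; x += 2) {
      dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
                   src_u[src_stride_u + 1] + 2) >> 2;
      dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
                   src_v[src_stride_v + 1] + 2) >> 2;
      src_u += 2;
      src_v += 2;
      dst_uv += 2;
    }
  }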
diff --git a/source/row_neon.cc b/source/row_neon.cc
index aecdf329..12591d33 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -701,6 +701,26 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
: "cc", "memory", "q0", "q1", "q2");
}
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %2, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst2.8 {d0, d1}, [%1]! \n" // dst += 16
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r12", "q0");
+}
+
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index f9e0fd36..d26d7abf 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -747,67 +747,99 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
- "ld1 {v3.16b}, [%4] \n" // shuffler
+ "ld1 {v3.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw \n"
"sub %0, %0, #32 \n"
"1: \n"
- "ld1 {v1.16b,v2.16b}, [%0], %3 \n" // src -= 32
+ "ldr q2, [%0, 16] \n"
+ "ldr q1, [%0], -32 \n" // src -= 32
"subs %w2, %w2, #32 \n" // 32 pixels per loop.
- "tbl v1.16b, {v1.16b}, v3.16b \n"
"tbl v0.16b, {v2.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
"st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
- : "r"((ptrdiff_t)-32), // %3
- "r"(&kShuffleMirror) // %4
+ : "r"(&kShuffleMirror) // %3
: "cc", "memory", "v0", "v1", "v2", "v3");
}
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorUV) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
// Start at end of source row.
+ "ld1 {v4.16b}, [%4] \n" // shuffler
"add %0, %0, %w3, sxtw #1 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
- "subs %w3, %w3, #8 \n" // 8 pixels per loop.
- "rev64 v0.8b, v0.8b \n"
- "rev64 v1.8b, v1.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // dst += 8
- "st1 {v1.8b}, [%2], #8 \n"
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((ptrdiff_t)-16) // %4
- : "cc", "memory", "v0", "v1");
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "uzp1 v0.16b, v2.16b, v3.16b \n" // U
+ "uzp2 v1.16b, v2.16b, v3.16b \n" // V
+ "st1 {v0.16b}, [%1], #16 \n" // dst += 16
+ "st1 {v1.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(&kShuffleMirrorUV) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
+// Shuffle table for reversing the ARGB.
+static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
+ 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
+
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "ld1 {v4.16b}, [%4] \n" // shuffler
- "add %0, %0, %w2, sxtw #2 \n" // Start at end of row.
- "sub %0, %0, #64 \n"
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #2 \n"
+ "sub %0, %0, #32 \n"
"1: \n"
- "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], %3\n" // src -= 64
- "subs %w2, %w2, #16 \n" // 16 pixels per loop.
- "tbl v0.16b, {v0.16b}, v4.16b \n"
- "tbl v1.16b, {v1.16b}, v4.16b \n"
- "tbl v2.16b, {v2.16b}, v4.16b \n"
- "tbl v3.16b, {v3.16b}, v4.16b \n"
- "st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n" // dst += 64
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-64), // %3
- "r"(&kShuffleMirror) // %4
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorARGB) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
@@ -3249,20 +3281,27 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3");
}
+// Shuffle table for swapping UV bytes.
+static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
- "orr v2.16b, v0.16b, v0.16b \n" // move U after V
+ "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
+ "ld1 {v1.16b}, [%0], 16 \n"
"subs %w2, %w2, #16 \n" // 16 pixels per loop
- "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
+ "tbl v0.16b, {v0.16b}, v2.16b \n"
+ "tbl v1.16b, {v1.16b}, v2.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "stp q0, q1, [%1], 32 \n" // store 16 VU pixels
"b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_vu), // %1
- "+r"(width) // %2
- :
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleSwapUV) // %3
: "cc", "memory", "v0", "v1", "v2");
}
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 6765ebfa..323f8d22 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -497,6 +497,7 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
+TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 31a42535..50bca4e4 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -782,44 +782,75 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
}
}
-TEST_F(LibYUVPlanarTest, TestARGBMirror) {
- SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
- SIMD_ALIGNED(uint8_t dst_pixels[1280][4]);
+TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_pixels_opt,
+ benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4);
- for (int i = 0; i < 1280; ++i) {
- orig_pixels[i][0] = i;
- orig_pixels[i][1] = i / 2;
- orig_pixels[i][2] = i / 3;
- orig_pixels[i][3] = i / 4;
- }
- ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+ MaskCpuFlags(disable_cpu_flags_);
+ ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
- for (int i = 0; i < 1280; ++i) {
- EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
- EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
- EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
- EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
- for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
- ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
}
-TEST_F(LibYUVPlanarTest, TestMirrorPlane) {
- SIMD_ALIGNED(uint8_t orig_pixels[1280]);
- SIMD_ALIGNED(uint8_t dst_pixels[1280]);
+TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_);
- for (int i = 0; i < 1280; ++i) {
- orig_pixels[i] = i;
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_);
+ MaskCpuFlags(disable_cpu_flags_);
+ MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_,
+ benchmark_width_, benchmark_height_);
}
- MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
- for (int i = 0; i < 1280; ++i) {
- EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i]);
+TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_pixels_opt,
+ benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2);
+
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+ MaskCpuFlags(disable_cpu_flags_);
+ MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
}
- for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
- MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, TestShade) {