aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/libyuv/row.h17
-rw-r--r--source/planar_functions.cc12
-rw-r--r--source/row_any.cc3
-rw-r--r--source/row_gcc.cc22
4 files changed, 52 insertions, 2 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 8bbb1309..861c6d3e 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -291,6 +291,7 @@ extern "C" {
#define HAS_CONVERT8TO16ROW_SSE2
#define HAS_DETILEROW_SSE2
#define HAS_DETILEROW_16_SSE2
+#define HAS_DETILEROW_16_AVX
#define HAS_DETILESPLITUVROW_SSSE3
#define HAS_DETILETOYUY2_SSE2
#define HAS_HALFMERGEUVROW_SSSE3
@@ -2030,6 +2031,14 @@ void DetileRow_Any_SSE2(const uint8_t* src,
ptrdiff_t src_tile_stride,
uint8_t* dst,
int width);
+void DetileRow_AVX(const uint8_t* src,  // NOTE(review): 8-bit AVX variant is declared here, but no definition and no HAS_DETILEROW_AVX macro are visible in this change - confirm they exist elsewhere.
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_Any_AVX(const uint8_t* src,  // NOTE(review): no matching ANYDETILE instantiation is visible in this change - confirm before referencing it.
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
void DetileRow_16_C(const uint16_t* src,
ptrdiff_t src_tile_stride,
uint16_t* dst,
@@ -2050,6 +2059,14 @@ void DetileRow_16_Any_SSE2(const uint16_t* src,
ptrdiff_t src_tile_stride,
uint16_t* dst,
int width);
+void DetileRow_16_AVX(const uint16_t* src,  // AVX kernel for 16-bit samples; implemented with inline asm in source/row_gcc.cc under HAS_DETILEROW_16_AVX.
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);  // DetilePlane_16 selects this kernel only when width is a multiple of 16.
+void DetileRow_16_Any_AVX(const uint16_t* src,  // Wrapper generated by ANYDETILE in source/row_any.cc; DetilePlane_16 uses it when width is not a multiple of 16.
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
void DetileSplitUVRow_C(const uint8_t* src_uv,
ptrdiff_t src_tile_stride,
uint8_t* dst_u,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 25f577b7..96914e08 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1002,7 +1002,7 @@ int DetilePlane_16(const uint16_t* src_y,
dst_stride_y = -dst_stride_y;
}
-#if defined(HAS_DETILEROW_SSE2)
+#if defined(HAS_DETILEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
DetileRow_16 = DetileRow_16_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
@@ -1010,7 +1010,15 @@ int DetilePlane_16(const uint16_t* src_y,
}
}
#endif
-#if defined(HAS_DETILEROW_NEON)
+#if defined(HAS_DETILEROW_16_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ DetileRow_16 = DetileRow_16_Any_AVX;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow_16 = DetileRow_16_AVX;
+ }
+ }
+#endif
+#if defined(HAS_DETILEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
DetileRow_16 = DetileRow_16_Any_NEON;
if (IS_ALIGNED(width, 16)) {
diff --git a/source/row_any.cc b/source/row_any.cc
index 4b60fa0f..3c7dc893 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2268,6 +2268,9 @@ ANYDETILE(DetileRow_16_Any_NEON, DetileRow_16_NEON, uint16_t, 2, 15)
#ifdef HAS_DETILEROW_16_SSE2
ANYDETILE(DetileRow_16_Any_SSE2, DetileRow_16_SSE2, uint16_t, 2, 15)
#endif
+#ifdef HAS_DETILEROW_16_AVX
+ANYDETILE(DetileRow_16_Any_AVX, DetileRow_16_AVX, uint16_t, 2, 15)
+#endif
#define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \
void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride, \
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 2ce7da91..f36d0cf0 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5051,6 +5051,28 @@ void DetileRow_16_SSE2(const uint16_t* src,
}
#endif // HAS_DETILEROW_SSE2
+#ifdef HAS_DETILEROW_16_AVX
+void DetileRow_16_AVX(const uint16_t* src,  // Copies one row of 16-bit samples from tiled memory into a linear row, 16 samples (32 bytes) per loop pass.
+ ptrdiff_t src_tile_stride,  // Step between consecutive 16-sample reads, in uint16_t elements (the lea below scales it by 2 to get bytes).
+ uint16_t* dst,  // Destination row; written contiguously, 32 bytes per pass.
+ int width) {  // Samples to copy; the loop runs at least once and subtracts 16 per pass (DetilePlane_16 checks IS_ALIGNED(width, 16) before selecting this kernel).
+ asm volatile(
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"  // Unaligned 32-byte load: 16 samples from src.
+ "lea (%0,%3,2),%0 \n"  // src += src_tile_stride * 2 bytes.
+ "vmovdqu %%ymm0,(%1) \n"  // Unaligned 32-byte store to dst.
+ "lea 0x20(%1),%1 \n"  // dst += 32 bytes (16 samples).
+ "sub $0x10,%2 \n"  // width -= 16; sets the flags tested by jg.
+ "jg 1b \n"  // Repeat while samples remain (width > 0).
+ "vzeroupper \n"  // Zero the upper halves of the ymm registers before returning to avoid AVX/SSE transition penalties.
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "xmm0");  // NOTE(review): the asm uses ymm0 but the clobber names "xmm0" - confirm the compiler treats this as covering the full register.
+}
+#endif // HAS_DETILEROW_16_AVX
+
#ifdef HAS_DETILETOYUY2_SSE2
// Read 16 Y, 8 UV, and write 8 YUYV.
void DetileToYUY2_SSE2(const uint8_t* src_y,