diff options
-rw-r--r-- | include/libyuv/row.h | 17 | ||||
-rw-r--r-- | source/planar_functions.cc | 12 | ||||
-rw-r--r-- | source/row_any.cc | 3 | ||||
-rw-r--r-- | source/row_gcc.cc | 22 |
4 files changed, 52 insertions, 2 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 8bbb1309..861c6d3e 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -291,6 +291,7 @@ extern "C" { #define HAS_CONVERT8TO16ROW_SSE2 #define HAS_DETILEROW_SSE2 #define HAS_DETILEROW_16_SSE2 +#define HAS_DETILEROW_16_AVX #define HAS_DETILESPLITUVROW_SSSE3 #define HAS_DETILETOYUY2_SSE2 #define HAS_HALFMERGEUVROW_SSSE3 @@ -2030,6 +2031,14 @@ void DetileRow_Any_SSE2(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width); +void DetileRow_AVX(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileRow_Any_AVX(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); void DetileRow_16_C(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, @@ -2050,6 +2059,14 @@ void DetileRow_16_Any_SSE2(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width); +void DetileRow_16_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_Any_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); void DetileSplitUVRow_C(const uint8_t* src_uv, ptrdiff_t src_tile_stride, uint8_t* dst_u, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 25f577b7..96914e08 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1002,7 +1002,7 @@ int DetilePlane_16(const uint16_t* src_y, dst_stride_y = -dst_stride_y; } -#if defined(HAS_DETILEROW_SSE2) +#if defined(HAS_DETILEROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { DetileRow_16 = DetileRow_16_Any_SSE2; if (IS_ALIGNED(width, 16)) { @@ -1010,7 +1010,15 @@ int DetilePlane_16(const uint16_t* src_y, } } #endif -#if defined(HAS_DETILEROW_NEON) +#if defined(HAS_DETILEROW_16_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + DetileRow_16 = DetileRow_16_Any_AVX; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_AVX; + } + } +#endif +#if defined(HAS_DETILEROW_16_NEON) if 
(TestCpuFlag(kCpuHasNEON)) { DetileRow_16 = DetileRow_16_Any_NEON; if (IS_ALIGNED(width, 16)) { diff --git a/source/row_any.cc b/source/row_any.cc index 4b60fa0f..3c7dc893 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2268,6 +2268,9 @@ ANYDETILE(DetileRow_16_Any_NEON, DetileRow_16_NEON, uint16_t, 2, 15) #ifdef HAS_DETILEROW_16_SSE2 ANYDETILE(DetileRow_16_Any_SSE2, DetileRow_16_SSE2, uint16_t, 2, 15) #endif +#ifdef HAS_DETILEROW_16_AVX +ANYDETILE(DetileRow_16_Any_AVX, DetileRow_16_AVX, uint16_t, 2, 15) +#endif #define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \ void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride, \ diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 2ce7da91..f36d0cf0 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -5051,6 +5051,28 @@ void DetileRow_16_SSE2(const uint16_t* src, } #endif // HAS_DETILEROW_SSE2 +#ifdef HAS_DETILEROW_16_AVX +void DetileRow_16_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea (%0,%3,2),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0"); +} +#endif // HAS_DETILEROW_16_AVX + #ifdef HAS_DETILETOYUY2_SSE2 // Read 16 Y, 8 UV, and write 8 YUYV. void DetileToYUY2_SSE2(const uint8_t* src_y, |