diff options
author | Frank Barchard <fbarchard@google.com> | 2022-08-17 11:20:36 -0700 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2022-08-17 18:39:05 +0000 |
commit | 3e38ce50589d9319badc0501f96d6c5b2b177472 (patch) | |
tree | 52ab2947f8d91d9463d6685099a0360e016a1421 /source | |
parent | 65e7c9d5706a77d1949da59bfcb0817c252ef8d6 (diff) | |
download | libyuv-3e38ce50589d9319badc0501f96d6c5b2b177472.tar.gz |
SSE2 MM21->YUY2 conversion
Add SSE2 optimization for MM21ToYUY2 conversion.
Bug: b/238137982
Change-Id: I189f712514308322f651b082b496bce9c015c4ee
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3832525
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
Diffstat (limited to 'source')
-rw-r--r-- | source/convert.cc | 4 |
-rw-r--r-- | source/planar_functions.cc | 14 |
-rw-r--r-- | source/row_any.cc | 4 |
-rw-r--r-- | source/row_gcc.cc | 33 |
-rw-r--r-- | source/scale.cc | 2 |
5 files changed, 51 insertions, 6 deletions
diff --git a/source/convert.cc b/source/convert.cc index 37066721..a740d5ca 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -262,10 +262,10 @@ int I210ToI420(const uint16_t* src_y, height); ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, dst_stride_u, src_u, dst_u, 0, 32768, dy, - /*wpp=*/1, scale, kFilterBilinear); + /*bpp=*/1, scale, kFilterBilinear); ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, dst_stride_v, src_v, dst_v, 0, 32768, dy, - /*wpp=*/1, scale, kFilterBilinear); + /*bpp=*/1, scale, kFilterBilinear); } return 0; } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index fae8630e..1de71dbb 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -915,7 +915,7 @@ int NV21ToNV12(const uint8_t* src_y, // tile width is 16 and assumed. // tile_height is 16 or 32 for MM21. // src_stride_y is bytes per row of source ignoring tiling. e.g. 640 -// TODO(fbarchard): More detile row functions. +// TODO: More detile row functions. LIBYUV_API void DetilePlane(const uint8_t* src_y, @@ -1074,6 +1074,15 @@ void DetileToYUY2(const uint8_t* src_y, } #endif +#if defined(HAS_DETILETOYUY2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + DetileToYUY2 = DetileToYUY2_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + DetileToYUY2 = DetileToYUY2_SSE2; + } + } +#endif + // Detile plane for (y = 0; y < height; ++y) { DetileToYUY2(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, @@ -1081,9 +1090,8 @@ void DetileToYUY2(const uint8_t* src_y, dst_yuy2 += dst_stride_yuy2; src_y += 16; - if (y & 0x1) { + if (y & 0x1) src_uv += 16; - } // Advance to next row of tiles. 
if ((y & (tile_height - 1)) == (tile_height - 1)) { diff --git a/source/row_any.cc b/source/row_any.cc index 5270e86c..bd46ba1b 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2272,6 +2272,10 @@ ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15) #endif +#ifdef HAS_DETILETOYUY2_SSE2 +ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15) +#endif + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 88766785..8d0f477c 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -4968,6 +4968,39 @@ void DetileRow_SSE2(const uint8_t* src, } #endif // HAS_DETILEROW_SSE2 +#ifdef HAS_DETILETOYUY2_SSE2 +// Read 16 Y, 8 UV, and write 8 YUYV. +void DetileToYUY2_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" // Load 16 Y + "sub $0x10,%3 \n" + "lea (%0,%4),%0 \n" + "movdqu (%1),%%xmm1 \n" // Load 8 UV + "lea (%1,%5),%1 \n" + "movdqu %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list + ); +} +#endif + #ifdef HAS_DETILESPLITUVROW_SSSE3 // TODO(greenjustin): Look into generating these constants instead of loading // them since this can cause branch mispredicts for fPIC code on 32-bit diff --git a/source/scale.cc b/source/scale.cc index 4980f42d..e1335f1e 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -2039,7 +2039,7 @@ void ScalePlane_16(const uint16_t* src, } // Arbitrary scale vertically, but unscaled horizontally. 
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, y, dy, /*wpp=*/1, filtering); + dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { |