aboutsummaryrefslogtreecommitdiff
path: root/source
diff options
context:
space:
mode:
author Frank Barchard <fbarchard@google.com> 2022-08-17 11:20:36 -0700
committer Frank Barchard <fbarchard@chromium.org> 2022-08-17 18:39:05 +0000
commit 3e38ce50589d9319badc0501f96d6c5b2b177472 (patch)
tree 52ab2947f8d91d9463d6685099a0360e016a1421 /source
parent 65e7c9d5706a77d1949da59bfcb0817c252ef8d6 (diff)
download libyuv-3e38ce50589d9319badc0501f96d6c5b2b177472.tar.gz
SSE2 MM21->YUY2 conversion
Add SSE2 optimization for MM21ToYUY2 conversion.

Bug: b/238137982
Change-Id: I189f712514308322f651b082b496bce9c015c4ee
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3832525
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
Diffstat (limited to 'source')
-rw-r--r-- source/convert.cc 4
-rw-r--r-- source/planar_functions.cc 14
-rw-r--r-- source/row_any.cc 4
-rw-r--r-- source/row_gcc.cc 33
-rw-r--r-- source/scale.cc 2
5 files changed, 51 insertions, 6 deletions
diff --git a/source/convert.cc b/source/convert.cc
index 37066721..a740d5ca 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -262,10 +262,10 @@ int I210ToI420(const uint16_t* src_y,
height);
ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u,
dst_stride_u, src_u, dst_u, 0, 32768, dy,
- /*wpp=*/1, scale, kFilterBilinear);
+ /*bpp=*/1, scale, kFilterBilinear);
ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v,
dst_stride_v, src_v, dst_v, 0, 32768, dy,
- /*wpp=*/1, scale, kFilterBilinear);
+ /*bpp=*/1, scale, kFilterBilinear);
}
return 0;
}
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index fae8630e..1de71dbb 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -915,7 +915,7 @@ int NV21ToNV12(const uint8_t* src_y,
// tile width is 16 and assumed.
// tile_height is 16 or 32 for MM21.
// src_stride_y is bytes per row of source ignoring tiling. e.g. 640
-// TODO(fbarchard): More detile row functions.
+// TODO: More detile row functions.
LIBYUV_API
void DetilePlane(const uint8_t* src_y,
@@ -1074,6 +1074,15 @@ void DetileToYUY2(const uint8_t* src_y,
}
#endif
+#if defined(HAS_DETILETOYUY2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ DetileToYUY2 = DetileToYUY2_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ DetileToYUY2 = DetileToYUY2_SSE2;
+ }
+ }
+#endif
+
// Detile plane
for (y = 0; y < height; ++y) {
DetileToYUY2(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride,
@@ -1081,9 +1090,8 @@ void DetileToYUY2(const uint8_t* src_y,
dst_yuy2 += dst_stride_yuy2;
src_y += 16;
- if (y & 0x1) {
+ if (y & 0x1)
src_uv += 16;
- }
// Advance to next row of tiles.
if ((y & (tile_height - 1)) == (tile_height - 1)) {
diff --git a/source/row_any.cc b/source/row_any.cc
index 5270e86c..bd46ba1b 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2272,6 +2272,10 @@ ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15)
ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15)
#endif
+#ifdef HAS_DETILETOYUY2_SSE2
+ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15)
+#endif
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 88766785..8d0f477c 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -4968,6 +4968,39 @@ void DetileRow_SSE2(const uint8_t* src,
}
#endif // HAS_DETILEROW_SSE2
+#ifdef HAS_DETILETOYUY2_SSE2
+// Interleave tiled Y and UV bytes into YUY2. Each loop pass reads 16 Y and 8 UV pairs (16 bytes each) and writes 8 YUYV groups (32 bytes). The loop body runs before width is tested, so one pass always executes.
+void DetileToYUY2_SSE2(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,  // Added to src_y after each 16-byte Y load.
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,  // Added to src_uv after each 16-byte UV load.
+ uint8_t* dst_yuy2,
+ int width) {  // Reduced by 16 per pass; the loop repeats while the result is > 0.
+ asm volatile(
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // Load 16 Y bytes from src_y (unaligned load).
+ "sub $0x10,%3 \n"  // width -= 16; the flags set here are consumed by the jg at the bottom.
+ "lea (%0,%4),%0 \n"  // src_y += src_y_tile_stride.
+ "movdqu (%1),%%xmm1 \n" // Load 16 bytes of UV (8 UV pairs) from src_uv.
+ "lea (%1,%5),%1 \n"  // src_uv += src_uv_tile_stride.
+ "movdqu %%xmm0,%%xmm2 \n"  // Copy the Y bytes so the high half survives the first unpack.
+ "punpcklbw %%xmm1,%%xmm0 \n"  // Interleave low 8 Y bytes with low 8 UV bytes.
+ "punpckhbw %%xmm1,%%xmm2 \n"  // Interleave high 8 Y bytes with high 8 UV bytes.
+ "movdqu %%xmm0,(%2) \n"  // Store first 16 output bytes.
+ "movdqu %%xmm2,0x10(%2) \n"  // Store second 16 output bytes.
+ "lea 0x20(%2),%2 \n"  // dst_yuy2 += 32.
+ "jg 1b \n"  // Loop while width > 0, using the flags from the sub above (lea, movdqu and punpck* do not modify EFLAGS).
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
+ : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list
+ );
+}
+#endif  // HAS_DETILETOYUY2_SSE2
+
#ifdef HAS_DETILESPLITUVROW_SSSE3
// TODO(greenjustin): Look into generating these constants instead of loading
// them since this can cause branch mispredicts for fPIC code on 32-bit
diff --git a/source/scale.cc b/source/scale.cc
index 4980f42d..e1335f1e 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -2039,7 +2039,7 @@ void ScalePlane_16(const uint16_t* src,
}
// Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst, 0, y, dy, /*wpp=*/1, filtering);
+ dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering);
return;
}
if (dst_width <= Abs(src_width) && dst_height <= src_height) {