aboutsummaryrefslogtreecommitdiff
path: root/source/rotate_common.cc
diff options
context:
space:
mode:
authorFrank Barchard <fbarchard@google.com>2023-02-13 10:52:58 -0800
committerlibyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com>2023-02-13 20:14:57 +0000
commit2bdc210be9eb11ded16bf3ef1f6cadb0d4dcb0c2 (patch)
treec446d71538c965d0e5391ef77cd49b45ba51463d /source/rotate_common.cc
parentb2528b0be934de1918e20c85fc170d809eeb49ab (diff)
downloadlibyuv-2bdc210be9eb11ded16bf3ef1f6cadb0d4dcb0c2.tar.gz
MergeUV_AVX512BW for I420ToNV12
On Skylake Xeon 640x360 100000 iterations
AVX512 MergeUVPlane_Opt (1196 ms)
AVX2 MergeUVPlane_Opt (1565 ms)
SSE2 MergeUVPlane_Opt (1780 ms)
Pixel 7 MergeUVPlane_Opt (1177 ms)

Bug: None
Change-Id: If47d4fa957cf27781bba5fd6a2f0bf554101a5c6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4242247
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Diffstat (limited to 'source/rotate_common.cc')
-rw-r--r--source/rotate_common.cc57
1 files changed, 57 insertions, 0 deletions
diff --git a/source/rotate_common.cc b/source/rotate_common.cc
index 2617c01b..4b496d1b 100644
--- a/source/rotate_common.cc
+++ b/source/rotate_common.cc
@@ -166,6 +166,63 @@ void TransposeWxH_16_C(const uint16_t* src,
}
}
// Transpose 32 bit values (ARGB).
//
// Each loop iteration reads a 4x4 tile of 32 bit pixels from src (4 rows of
// 16 bytes, rows src_stride bytes apart) and writes it transposed to dst
// (4 rows of 16 bytes, rows dst_stride bytes apart).  src then advances by
// 4 rows and dst by 4 pixels (16 bytes).
//
// width is the loop bound and is stepped by 4, so a width that is not a
// multiple of 4 still reads and writes one complete final tile.
//
// Pixels are moved bytewise through a local tile rather than through
// uint32_t pointer casts, so neither buffer has to be 4 byte aligned, the
// const qualifier on src is preserved, and no byte buffer is accessed
// through an incompatible lvalue type.
void Transpose4x4_32_C(const uint8_t* src,
                       int src_stride,
                       uint8_t* dst,
                       int dst_stride,
                       int width) {
  int i;
  for (i = 0; i < width; i += 4) {
    // tile[col] holds the 16 bytes of output row col, i.e. source column
    // col taken from source rows 0..3.
    uint8_t tile[4][16];
    int row;
    int col;
    int b;
    // Load the whole tile before storing any of it.
    for (row = 0; row < 4; ++row) {
      const uint8_t* s = src + row * src_stride;
      for (col = 0; col < 4; ++col) {
        for (b = 0; b < 4; ++b) {
          tile[col][row * 4 + b] = s[col * 4 + b];
        }
      }
    }
    for (col = 0; col < 4; ++col) {
      uint8_t* d = dst + col * dst_stride;
      for (b = 0; b < 16; ++b) {
        d[b] = tile[col][b];
      }
    }
    src += src_stride * 4;  // advance 4 rows
    dst += 4 * 4;           // advance 4 columns
  }
}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv