diff options
author | Yuan Tong <tongyuan200097@gmail.com> | 2021-02-25 15:21:28 +0800 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2021-02-25 23:16:54 +0000 |
commit | a8c181050c202854ae32433164e6bd5d1e7c4368 (patch) | |
tree | c300dbf9bfa59d0dc2772c311b0dfd154e08d54a | |
parent | 08815a29766a78398a8e2b9ed095280e9d0a73c2 (diff) | |
download | libyuv-a8c181050c202854ae32433164e6bd5d1e7c4368.tar.gz |
Add 10/12 bit YUV To YUV functions
The following functions (and their 12 bit variants) are added:
planar, 10->10:
I410ToI010, I210ToI010
planar, 10->8:
I410ToI444, I210ToI422
planar<->biplanar, 10->10:
I010ToP010, I210ToP210, I410ToP410
P010ToI010, P210ToI210, P410ToI410
R=fbarchard@chromium.org
Change-Id: I9aa2bafa0d6a6e1e38ce4e20cbb437e10f9b0158
Bug: libyuv:834, libyuv:873
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2709822
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
-rw-r--r-- | README.chromium | 2 | ||||
-rw-r--r-- | include/libyuv/convert.h | 183 | ||||
-rw-r--r-- | include/libyuv/convert_from.h | 18 | ||||
-rw-r--r-- | include/libyuv/planar_functions.h | 44 | ||||
-rw-r--r-- | include/libyuv/row.h | 92 | ||||
-rw-r--r-- | include/libyuv/version.h | 2 | ||||
-rw-r--r-- | include/libyuv/video_common.h | 11 | ||||
-rw-r--r-- | source/convert.cc | 349 | ||||
-rw-r--r-- | source/convert_from.cc | 44 | ||||
-rw-r--r-- | source/planar_functions.cc | 210 | ||||
-rw-r--r-- | source/row_any.cc | 76 | ||||
-rw-r--r-- | source/row_common.cc | 46 | ||||
-rw-r--r-- | source/row_gcc.cc | 108 | ||||
-rw-r--r-- | source/row_neon.cc | 115 | ||||
-rw-r--r-- | source/row_neon64.cc | 120 | ||||
-rw-r--r-- | unit_test/convert_test.cc | 387 | ||||
-rw-r--r-- | unit_test/video_common_test.cc | 5 |
17 files changed, 1577 insertions, 235 deletions
diff --git a/README.chromium b/README.chromium index 01b05888..51b4a11e 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1778 +Version: 1779 License: BSD License File: LICENSE diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 4e58ad6e..40869ef2 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -193,6 +193,129 @@ int I010ToI420(const uint16_t* src_y, int width, int height); +#define H210ToH422 I210ToI422 +LIBYUV_API +int I210ToI422(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +#define H410ToH444 I410ToI444 +LIBYUV_API +int I410ToI444(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +#define H012ToH420 I012ToI420 +LIBYUV_API +int I012ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +#define H212ToH422 I212ToI422 +LIBYUV_API +int I212ToI422(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +#define H412ToH444 I412ToI444 +LIBYUV_API +int I412ToI444(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const 
uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +#define I412ToI012 I410ToI010 +#define H410ToH010 I410ToI010 +#define H412ToH012 I410ToI010 +LIBYUV_API +int I410ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + +#define I212ToI012 I210ToI010 +#define H210ToH010 I210ToI010 +#define H212ToH012 I210ToI010 +LIBYUV_API +int I210ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + // Convert I010 to I410 LIBYUV_API int I010ToI410(const uint16_t* src_y, @@ -233,6 +356,66 @@ int I210ToI410(const uint16_t* src_y, // Convert I212 to I412 #define I212ToI412 I210ToI410 +// Convert I010 to P010 +LIBYUV_API +int I010ToP010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Convert I210 to P210 +LIBYUV_API +int I210ToP210(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Convert I012 to P012 +LIBYUV_API +int I012ToP012(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* 
dst_uv, + int dst_stride_uv, + int width, + int height); + +// Convert I212 to P212 +LIBYUV_API +int I212ToP212(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height); + // Convert I400 (grey) to I420. LIBYUV_API int I400ToI420(const uint8_t* src_y, diff --git a/include/libyuv/convert_from.h b/include/libyuv/convert_from.h index 5140ed4f..32f42a63 100644 --- a/include/libyuv/convert_from.h +++ b/include/libyuv/convert_from.h @@ -39,6 +39,24 @@ int I420ToI010(const uint8_t* src_y, int width, int height); +// Convert 8 bit YUV to 12 bit. +#define H420ToH012 I420ToI012 +LIBYUV_API +int I420ToI012(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + LIBYUV_API int I420ToI422(const uint8_t* src_y, int src_stride_y, diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index ce94e162..ebefb568 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -105,6 +105,50 @@ void MergeUVPlane(const uint8_t* src_u, int width, int height); +// Split interleaved msb UV plane into separate lsb U and V planes. +LIBYUV_API +void SplitUVPlane_16(const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth); + +// Merge separate lsb U and V planes into one interleaved msb UV plane. 
+LIBYUV_API +void MergeUVPlane_16(const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height, + int depth); + +// Convert lsb plane to msb plane +LIBYUV_API +void ConvertToMSBPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int depth); + +// Convert msb plane to lsb plane +LIBYUV_API +void ConvertToLSBPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int depth); + // Scale U and V to half width and height and merge into interleaved UV plane. // width and height are source size, allowing odd sizes. // Use for converting I444 or I422 to NV12. diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 12233856..68fb88b3 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -305,6 +305,7 @@ extern "C" { #define HAS_ARGBTORGB24ROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 +#define HAS_DIVIDEROW_16_AVX2 #define HAS_HALFMERGEUVROW_AVX2 #define HAS_MERGEARGBROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 @@ -318,6 +319,7 @@ extern "C" { #define HAS_MULTIPLYROW_16_AVX2 #define HAS_RGBATOYJROW_AVX2 #define HAS_SPLITARGBROW_AVX2 +#define HAS_SPLITUVROW_16_AVX2 #define HAS_SWAPUVROW_AVX2 // TODO(fbarchard): Fix AVX2 version of YUV24 // #define HAS_NV21TOYUV24ROW_AVX2 @@ -363,6 +365,7 @@ extern "C" { #define HAS_BGRATOYROW_NEON #define HAS_BYTETOFLOATROW_NEON #define HAS_COPYROW_NEON +#define HAS_DIVIDEROW_16_NEON #define HAS_HALFFLOATROW_NEON #define HAS_HALFMERGEUVROW_NEON #define HAS_I400TOARGBROW_NEON @@ -380,9 +383,11 @@ extern "C" { #define HAS_J400TOARGBROW_NEON #define HAS_MERGEARGBROW_NEON #define HAS_MERGEUVROW_NEON +#define HAS_MERGEUVROW_16_NEON #define HAS_MIRRORROW_NEON #define HAS_MIRRORUVROW_NEON #define HAS_MIRRORSPLITUVROW_NEON +#define HAS_MULTIPLYROW_16_NEON #define HAS_NV12TOARGBROW_NEON 
#define HAS_NV12TORGB24ROW_NEON #define HAS_NV12TORGB565ROW_NEON @@ -409,6 +414,7 @@ extern "C" { #define HAS_SPLITARGBROW_NEON #define HAS_SPLITRGBROW_NEON #define HAS_SPLITUVROW_NEON +#define HAS_SPLITUVROW_16_NEON #define HAS_SWAPUVROW_NEON #define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON @@ -2010,22 +2016,96 @@ void SplitXRGBRow_Any_NEON(const uint8_t* src_argb, void MergeUVRow_16_C(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, - int scale, /* 64 for 10 bit */ + int depth, int width); void MergeUVRow_16_AVX2(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, - int scale, + int depth, int width); +void MergeUVRow_16_Any_AVX2(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width); +void MergeUVRow_16_NEON(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width); +void MergeUVRow_16_Any_NEON(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width); + +void SplitUVRow_16_C(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width); +void SplitUVRow_16_AVX2(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width); +void SplitUVRow_16_Any_AVX2(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width); +void SplitUVRow_16_NEON(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width); +void SplitUVRow_16_Any_NEON(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width); -void MultiplyRow_16_AVX2(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); void MultiplyRow_16_C(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); +void MultiplyRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void MultiplyRow_16_Any_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void 
MultiplyRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void MultiplyRow_16_Any_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); + +void DivideRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void DivideRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void DivideRow_16_Any_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void DivideRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void DivideRow_16_Any_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); void Convert8To16Row_C(const uint8_t* src_y, uint16_t* dst_y, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index ff6531bd..e59b316a 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1778 +#define LIBYUV_VERSION 1779 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/include/libyuv/video_common.h b/include/libyuv/video_common.h index 6e408eda..0da3fb55 100644 --- a/include/libyuv/video_common.h +++ b/include/libyuv/video_common.h @@ -60,7 +60,7 @@ enum FourCC { FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420 - FOURCC_I210 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 422 + FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422 // 1 Secondary YUV format: row biplanar. deprecated. FOURCC_M420 = FOURCC('M', '4', '2', '0'), @@ -109,6 +109,8 @@ enum FourCC { FOURCC_F210 = FOURCC('F', '2', '1', '0'), // bt.709 full range 10 bit 422 FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422 FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422 + FOURCC_P010 = FOURCC('P', '0', '1', '0'), + FOURCC_P210 = FOURCC('P', '2', '1', '0'), // 14 Auxiliary aliases. 
CanonicalFourCC() maps these to canonical fourcc. FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. @@ -178,7 +180,12 @@ enum FourCCBpp { FOURCC_BPP_J400 = 8, FOURCC_BPP_H420 = 12, FOURCC_BPP_H422 = 16, - FOURCC_BPP_H010 = 24, + FOURCC_BPP_I010 = 15, + FOURCC_BPP_I210 = 20, + FOURCC_BPP_H010 = 15, + FOURCC_BPP_H210 = 20, + FOURCC_BPP_P010 = 15, + FOURCC_BPP_P210 = 20, FOURCC_BPP_MJPG = 0, // 0 means unknown. FOURCC_BPP_H264 = 0, FOURCC_BPP_IYUV = 12, diff --git a/source/convert.cc b/source/convert.cc index b0314df4..1bd59659 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -149,6 +149,52 @@ int I010Copy(const uint16_t* src_y, return 0; } +static int Planar16bitTo8bit(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int subsample_x, + int subsample_y, + int depth) { + int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + int scale = 1 << (24 - depth); + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + uv_height = -uv_height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (uv_height - 1) * src_stride_u; + src_v = src_v + (uv_height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + // Convert UV planes. + Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale, uv_width, + uv_height); + Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale, uv_width, + uv_height); + return 0; +} + // Convert 10 bit YUV to 8 bit. 
LIBYUV_API int I010ToI420(const uint16_t* src_y, @@ -165,34 +211,295 @@ int I010ToI420(const uint16_t* src_y, int dst_stride_v, int width, int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 1, 10); +} + +LIBYUV_API +int I210ToI422(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 0, 10); +} + +LIBYUV_API +int I410ToI444(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 0, + 0, 10); +} + +LIBYUV_API +int I012ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 1, 12); +} + +LIBYUV_API +int 
I212ToI422(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 0, 12); +} + +LIBYUV_API +int I412ToI444(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 0, + 0, 12); +} + +// Any Ix10 To I010 format with mirroring. +static int Ix10ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + int subsample_x, + int subsample_y) { + const int dst_y_width = Abs(width); + const int dst_y_height = Abs(height); + const int src_uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + const int src_uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); + const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + if (width <= 0 || height == 0) { return -1; } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; + if (dst_y) { + ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + dst_y_width, dst_y_height, kFilterBilinear); } + ScalePlane_12(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane_12(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return 0; +} - // Convert Y plane. - Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width, - height); - // Convert UV planes. - Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth, - halfheight); - Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth, - halfheight); +LIBYUV_API +int I410ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 0, 0); +} + +LIBYUV_API +int I210ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, 
dst_stride_v, width, height, 1, 0); +} + +// Any I[420]1[02] to P[420]1[02] format with mirroring. +static int Ix1xToPx1x(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height, + int subsample_x, + int subsample_y, + int depth) { + const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + if (width <= 0 || height == 0) { + return -1; + } + + ConvertToMSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height, + depth); + MergeUVPlane_16(src_u, src_stride_u, src_v, src_stride_v, dst_uv, + dst_stride_uv, uv_width, uv_height, depth); return 0; } +LIBYUV_API +int I010ToP010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, + width, height, 1, 1, 10); +} + +LIBYUV_API +int I210ToP210(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, + width, height, 1, 0, 10); +} + +LIBYUV_API +int I012ToP012(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + return Ix1xToPx1x(src_y, src_stride_y, 
src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, + width, height, 1, 1, 12); +} + +LIBYUV_API +int I212ToP212(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, + width, height, 1, 0, 12); +} + // 422 chroma is 1/2 width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API diff --git a/source/convert_from.cc b/source/convert_from.cc index 591e2782..687f0a72 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -111,6 +111,50 @@ int I420ToI010(const uint8_t* src_y, return 0; } +// Convert 8 bit YUV to 12 bit. +LIBYUV_API +int I420ToI012(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 4096, width, + height); + // Convert UV planes. 
+ Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 4096, halfwidth, + halfheight); + Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 4096, halfwidth, + halfheight); + return 0; +} + // 420 chroma is 1/2 width, 1/2 height // 422 chroma is 1/2 width, 1x height LIBYUV_API diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 069be7fd..219c2165 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -550,6 +550,216 @@ void MergeUVPlane(const uint8_t* src_u, } } +// Support function for P010 etc UV channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitUVPlane_16(const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + int y; + int scale = 1 << depth; + void (*SplitUVRow)(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, + int scale, int width) = SplitUVRow_16_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. + if (src_stride_uv == width * 2 && dst_stride_u == width && + dst_stride_v == width) { + width *= height; + height = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_SPLITUVROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_16_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_16_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_16_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SplitUVRow = SplitUVRow_16_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of UV. 
+ SplitUVRow(src_uv, dst_u, dst_v, scale, width); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } +} + +LIBYUV_API +void MergeUVPlane_16(const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height, + int depth) { + int y; + int scale = 1 << (16 - depth); + void (*MergeUVRow)(const uint16_t* src_u, const uint16_t* src_v, + uint16_t* dst_uv, int scale, int width) = MergeUVRow_16_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uv = dst_uv + (height - 1) * dst_stride_uv; + dst_stride_uv = -dst_stride_uv; + } + // Coalesce rows. + if (src_stride_u == width && src_stride_v == width && + dst_stride_uv == width * 2) { + width *= height; + height = 1; + src_stride_u = src_stride_v = dst_stride_uv = 0; + } +#if defined(HAS_MERGEUVROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow = MergeUVRow_16_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_16_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow = MergeUVRow_16_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeUVRow = MergeUVRow_16_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of UV. + MergeUVRow(src_u, src_v, dst_uv, scale, width); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += dst_stride_uv; + } +} + +// Convert plane from lsb to msb +LIBYUV_API +void ConvertToMSBPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int depth) { + int y; + int scale = 1 << (16 - depth); + void (*MultiplyRow)(const uint16_t* src_y, uint16_t* dst_y, int scale, + int width) = MultiplyRow_16_C; + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } + +#if defined(HAS_MULTIPLYROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MultiplyRow = MultiplyRow_16_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MultiplyRow = MultiplyRow_16_AVX2; + } + } +#endif +#if defined(HAS_MULTIPLYROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MultiplyRow = MultiplyRow_16_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MultiplyRow = MultiplyRow_16_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MultiplyRow(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert plane from msb to lsb +LIBYUV_API +void ConvertToLSBPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int depth) { + int y; + int scale = 1 << depth; + void (*DivideRow)(const uint16_t* src_y, uint16_t* dst_y, int scale, + int width) = DivideRow_16_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. 
+ if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } + +#if defined(HAS_DIVIDEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + DivideRow = DivideRow_16_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + DivideRow = DivideRow_16_AVX2; + } + } +#endif +#if defined(HAS_DIVIDEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DivideRow = DivideRow_16_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DivideRow = DivideRow_16_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + DivideRow(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + // Swap U and V channels in interleaved UV plane. LIBYUV_API void SwapUVPlane(const uint8_t* src_uv, diff --git a/source/row_any.cc b/source/row_any.cc index 57c39d5d..08ae1d2a 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -546,6 +546,32 @@ ANY21C(NV12ToRGB565Row_Any_MMI, NV12ToRGB565Row_MMI, 1, 1, 2, 2, 7) #endif #undef ANY21C +// Any 2 16 bit planes with parameter to 1 +#define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 4]); \ + memset(temp, 0, 16 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_u, src_v, dst_uv, depth, n); \ + } \ + memcpy(temp, src_u + n, r * BPP); \ + memcpy(temp + 16, src_v + n, r * BPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, depth, MASK + 1); \ + memcpy(dst_uv + n * 2, temp + 32, r * BPP * 2); \ + } + +#ifdef HAS_MERGEUVROW_16_AVX2 +ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_16_NEON +ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7) +#endif + +#undef ANY21CT + // Any 1 to 1. 
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ @@ -1126,6 +1152,30 @@ ANY11C(Convert8To16Row_Any_AVX2, uint16_t, 31) #endif +#ifdef HAS_MULTIPLYROW_16_AVX2 +ANY11C(MultiplyRow_16_Any_AVX2, + MultiplyRow_16_AVX2, + 2, + 2, + uint16_t, + uint16_t, + 31) +#endif +#ifdef HAS_MULTIPLYROW_16_NEON +ANY11C(MultiplyRow_16_Any_NEON, + MultiplyRow_16_NEON, + 2, + 2, + uint16_t, + uint16_t, + 15) +#endif +#ifdef HAS_DIVIDEROW_16_AVX2 +ANY11C(DivideRow_16_Any_AVX2, DivideRow_16_AVX2, 2, 2, uint16_t, uint16_t, 31) +#endif +#ifdef HAS_DIVIDEROW_16_NEON +ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15) +#endif #undef ANY11C // Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. @@ -1405,6 +1455,32 @@ ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15) #endif #undef ANY12 +// Any 2 16 bit planes with parameter to 1 +#define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \ + SIMD_ALIGNED(T temp[16 * 4]); \ + memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_uv, dst_u, dst_v, depth, n); \ + } \ + memcpy(temp, src_uv + n * 2, r * BPP * 2); \ + ANY_SIMD(temp, temp + 32, temp + 48, depth, MASK + 1); \ + memcpy(dst_u + n, temp + 32, r * BPP); \ + memcpy(dst_v + n, temp + 48, r * BPP); \ + } + +#ifdef HAS_SPLITUVROW_16_AVX2 +ANY12PT(SplitUVRow_16_Any_AVX2, SplitUVRow_16_AVX2, uint16_t, 2, 15) +#endif + +#ifdef HAS_SPLITUVROW_16_NEON +ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7) +#endif + +#undef ANY21CT + // Any 1 to 3. Outputs RGB planes. 
#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ diff --git a/source/row_common.cc b/source/row_common.cc index eb889c83..a941c3f5 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2521,27 +2521,33 @@ void MergeXRGBRow_C(const uint8_t* src_r, } } -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 128 = 9 bits -// 64 = 10 bits -// 16 = 12 bits -// 1 = 16 bits +// Convert lsb formats to msb, depending on sample depth. void MergeUVRow_16_C(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, - int scale, + int depth, int width) { + int shift = 16 - depth; int x; - for (x = 0; x < width - 1; x += 2) { - dst_uv[0] = src_u[x] * scale; - dst_uv[1] = src_v[x] * scale; - dst_uv[2] = src_u[x + 1] * scale; - dst_uv[3] = src_v[x + 1] * scale; - dst_uv += 4; + for (x = 0; x < width; ++x) { + dst_uv[0] = src_u[x] << shift; + dst_uv[1] = src_v[x] << shift; + dst_uv += 2; } - if (width & 1) { - dst_uv[0] = src_u[width - 1] * scale; - dst_uv[1] = src_v[width - 1] * scale; +} + +// Convert msb formats to lsb, depending on sample depth. 
+void SplitUVRow_16_C(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + int shift = 16 - depth; + int x; + for (x = 0; x < width; ++x) { + dst_u[x] = src_uv[0] >> shift; + dst_v[x] = src_uv[1] >> shift; + src_uv += 2; } } @@ -2555,6 +2561,16 @@ void MultiplyRow_16_C(const uint16_t* src_y, } } +void DivideRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = (src_y[x] * scale) >> 16; + } +} + // Use scale to convert lsb formats to msb, depending how many bits there are: // 32768 = 9 bits // 16384 = 10 bits diff --git a/source/row_gcc.cc b/source/row_gcc.cc index cf87d46e..faf0fc91 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -3653,22 +3653,18 @@ void MergeUVRow_SSE2(const uint8_t* src_u, } #endif // HAS_MERGEUVROW_SSE2 -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 128 = 9 bits -// 64 = 10 bits -// 16 = 12 bits -// 1 = 16 bits #ifdef HAS_MERGEUVROW_16_AVX2 void MergeUVRow_16_AVX2(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, - int scale, + int depth, int width) { + depth = 16 - depth; // clang-format off asm volatile ( "vmovd %4,%%xmm3 \n" "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" + "vbroadcastss %%xmm3,%%xmm3 \n" "sub %0,%1 \n" // 16 pixels per loop. 
@@ -3678,8 +3674,8 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, "vmovdqu (%0,%1,1),%%ymm1 \n" "add $0x20,%0 \n" - "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsllw %%xmm3,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm3,%%ymm1,%%ymm1 \n" "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm2,(%2) \n" @@ -3694,12 +3690,62 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, "+r"(src_v), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 - : "r"(scale) // %4 + : "r"(depth) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); // clang-format on } #endif // HAS_MERGEUVROW_AVX2 +#ifdef HAS_MERGEUVROW_16_AVX2 +const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15}; +void SplitUVRow_16_AVX2(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + depth = 16 - depth; + // clang-format off + asm volatile ( + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%xmm3 \n" + "vbroadcastf128 %5,%%ymm4 \n" + "sub %1,%2 \n" + + // 16 pixels per loop. 
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + + "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x0,%%ymm1,0x10(%1) \n" + "vextractf128 $0x1,%%ymm0,(%1,%2) \n" + "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" + "add $0x20,%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width), // %3 + "+r"(depth) // %4 + : + "m"(kSplitUVShuffle16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); + // clang-format on +} +#endif // HAS_MERGEUVROW_AVX2 + // Use scale to convert lsb formats to msb, depending how many bits there are: // 128 = 9 bits // 64 = 10 bits @@ -3717,7 +3763,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, "vbroadcastss %%xmm3,%%ymm3 \n" "sub %0,%1 \n" - // 16 pixels per loop. + // 32 pixels per loop. LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" @@ -3739,6 +3785,46 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, } #endif // HAS_MULTIPLYROW_16_AVX2 +// Use scale to convert msb formats to lsb, depending how many bits there are: +// 512 = 9 bits +// 1024 = 10 bits +// 4096 = 12 bits +// 65536 = 16 bits +#ifdef HAS_DIVIDEROW_16_AVX2 +void DivideRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 32 pixels per loop. 
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width), // %2 + "+r"(scale) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + // Use scale to convert lsb formats to msb, depending how many bits there are: // 32768 = 9 bits // 16384 = 10 bits diff --git a/source/row_neon.cc b/source/row_neon.cc index e54cb12b..43a2cac7 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -3166,6 +3166,121 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, : "cc", "memory", "q0", "q1", "q2", "q3"); } +void SplitUVRow_16_NEON(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + asm volatile( + "vdup.32 q0, %3 \n" + "1: \n" + "vld2.16 {q1, q2}, [%0]! \n" // load 8 UV + "vmovl.u16 q3, d2 \n" + "vmovl.u16 q4, d3 \n" + "vshl.u32 q3, q3, q0 \n" + "vshl.u32 q4, q4, q0 \n" + "vmovn.u32 d2, q3 \n" + "vmovn.u32 d3, q4 \n" + "vmovl.u16 q3, d4 \n" + "vmovl.u16 q4, d5 \n" + "vshl.u32 q3, q3, q0 \n" + "vshl.u32 q4, q4, q0 \n" + "vmovn.u32 d4, q3 \n" + "vmovn.u32 d5, q4 \n" + "subs %4, %4, #8 \n" // 8 src pixels per loop + "vst1.16 {q1}, [%1]! \n" // store 8 U pixels + "vst1.16 {q2}, [%2]! \n" // store 8 V pixels + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(depth), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + +void MergeUVRow_16_NEON(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width) { + int shift = 16 - depth; + asm volatile( + "vdup.16 q2, %3 \n" + "1: \n" + "vld1.16 {q0}, [%0]! \n" // load 8 U + "vld1.16 {q1}, [%1]! 
\n" // load 8 V + "vshl.u16 q0, q0, q2 \n" + "vshl.u16 q1, q1, q2 \n" + "subs %4, %4, #8 \n" // 8 src pixels per loop + "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(shift), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2"); +} + +void MultiplyRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + asm volatile( + "vdup.16 q2, %2 \n" + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vmul.u16 q0, q0, q2 \n" + "vmul.u16 q1, q1, q2 \n" + "vst1.16 {q0}, [%1]! \n" + "vst1.16 {q1}, [%1]! \n" + "subs %3, %3, #16 \n" // 16 src pixels per loop + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(scale), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2"); +} + +void DivideRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + asm volatile( + "vdup.16 q0, %2 \n" + "1: \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vmovl.u16 q3, d2 \n" + "vmovl.u16 q1, d3 \n" + "vmovl.u16 q4, d4 \n" + "vmovl.u16 q2, d5 \n" + "vshl.u32 q3, q3, q0 \n" + "vshl.u32 q4, q4, q0 \n" + "vshl.u32 q1, q1, q0 \n" + "vshl.u32 q2, q2, q0 \n" + "vmovn.u32 d2, q3 \n" + "vmovn.u32 d3, q1 \n" + "vmovn.u32 d4, q4 \n" + "vmovn.u32 d5, q2 \n" + "vst1.16 {q1}, [%1]! \n" + "vst1.16 {q2}, [%1]! \n" + "subs %3, %3, #16 \n" // 16 src pixels per loop + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(scale), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. 
#ifdef __cplusplus diff --git a/source/row_neon64.cc b/source/row_neon64.cc index acefd96d..941c9b98 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -3526,6 +3526,126 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, : "cc", "memory", "v0", "v1", "v2", "v3"); } +void SplitUVRow_16_NEON(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + asm volatile( + "dup v0.4s, %w3 \n" + "1: \n" + "ld2 {v1.8h, v2.8h}, [%0], #32 \n" // load 8 UV + "prfm pldl1keep, [%0, 448] \n" + "ushll v3.4s, v1.4h, #0 \n" + "ushll2 v4.4s, v1.8h, #0 \n" + "ushl v3.4s, v3.4s, v0.4s \n" + "ushl v4.4s, v4.4s, v0.4s \n" + "xtn v1.4h, v3.4s \n" + "xtn2 v1.8h, v4.4s \n" + "ushll v3.4s, v2.4h, #0 \n" + "ushll2 v4.4s, v2.8h, #0 \n" + "ushl v3.4s, v3.4s, v0.4s \n" + "ushl v4.4s, v4.4s, v0.4s \n" + "xtn v2.4h, v3.4s \n" + "xtn2 v2.8h, v4.4s \n" + "subs %w4, %w4, #8 \n" // 8 src pixels per loop + "st1 {v1.8h}, [%1], #16 \n" // store 8 U pixels + "st1 {v2.8h}, [%2], #16 \n" // store 8 V pixels + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(depth), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +void MergeUVRow_16_NEON(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width) { + int shift = 16 - depth; + asm volatile( + "dup v2.8h, %w3 \n" + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // load 8 U + "prfm pldl1keep, [%0, 448] \n" + "ld1 {v1.8h}, [%1], #16 \n" // load 8 V + "prfm pldl1keep, [%1, 448] \n" + "ushl v0.8h, v0.8h, v2.8h \n" + "ushl v1.8h, v1.8h, v2.8h \n" + "subs %w4, %w4, #8 \n" // 8 src pixels per loop + "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(shift), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2"); +} + +void MultiplyRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + asm 
volatile( + "dup v2.8h, %w2 \n" + "1: \n" + "ldp q0, q1, [%0] \n" + "add %0, %0, #32 \n" + "prfm pldl1keep, [%0, 448] \n" + "mul v0.8h, v0.8h, v2.8h \n" + "mul v1.8h, v1.8h, v2.8h \n" + "stp q0, q1, [%1] \n" // store 16 pixels + "add %1, %1, #32 \n" + "subs %w3, %w3, #16 \n" // 16 src pixels per loop + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(scale), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2"); +} + +void DivideRow_16_NEON(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + asm volatile( + "dup v0.8h, %w2 \n" + "1: \n" + "ldp q1, q2, [%0] \n" + "add %0, %0, #32 \n" + "prfm pldl1keep, [%0, 448] \n" + "ushll v3.4s, v1.4h, #0 \n" + "ushll v4.4s, v2.4h, #0 \n" + "ushll2 v1.4s, v1.8h, #0 \n" + "ushll2 v2.4s, v2.8h, #0 \n" + "mul v3.4s, v0.4s, v3.4s \n" + "mul v4.4s, v0.4s, v4.4s \n" + "mul v1.4s, v0.4s, v1.4s \n" + "mul v2.4s, v0.4s, v2.4s \n" + "shrn v3.4h, v3.4s, #16 \n" + "shrn v4.4h, v4.4s, #16 \n" + "shrn2 v3.8h, v1.4s, #16 \n" + "shrn2 v4.8h, v2.4s, #16 \n" + "stp q3, q3, [%1] \n" // store 16 pixels + "add %1, %1, #32 \n" + "subs %w3, %w3, #16 \n" // 16 src pixels per loop + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(scale), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 50593160..8638a84c 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -158,15 +158,26 @@ TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1, 8) TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I444, uint8_t, 1, 1, 1, 8) TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1, 8) TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10) -TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10) TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2, 8) 
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I012, uint16_t, 2, 2, 2, 8) TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10) TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10) TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8) +TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8) TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10) TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10) TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12) TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I412, uint16_t, 2, 1, 1, 12) +TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I010, uint16_t, 2, 2, 2, 10) +TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I010, uint16_t, 2, 2, 2, 10) +TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I012, uint16_t, 2, 2, 2, 12) +TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I012, uint16_t, 2, 2, 2, 12) +TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10) +TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 10) +TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 10) +TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 12) +TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 12) +TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12) // Test Android 420 to I420 #define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \ @@ -292,63 +303,74 @@ int I400ToNV21(const uint8_t* src_y, dst_stride_vu, width, height); } -#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ +#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ + DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ + SRC_DEPTH) \ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ + static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC 
unsupported"); \ + static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ + "SRC_SUBSAMP_X unsupported"); \ + static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ + "SRC_SUBSAMP_Y unsupported"); \ + static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ + "DST_SUBSAMP_X unsupported"); \ + static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ + "DST_SUBSAMP_Y unsupported"); \ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ const int kHeight = benchmark_height_; \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ - align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ - OFF); \ - align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ - OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kWidth; ++j) \ - src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ - src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ - (fastrand() & 0xff); \ - src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ - (fastrand() & 0xff); \ - } \ + const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ + const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ + const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ + const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \ + align_buffer_page_end(src_u, \ + kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ + align_buffer_page_end(src_v, \ + 
kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_uv_c, \ + kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_uv_opt, \ + kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ + MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \ + MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ + MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ + SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \ + SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \ + SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \ } \ - memset(dst_y_c, 1, kWidth* kHeight); \ - memset(dst_uv_c, 2, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth* kHeight); \ - memset(dst_uv_opt, 102, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \ + src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \ + src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \ + } \ + memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ + memset(dst_uv_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ + memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ + memset(dst_uv_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ MaskCpuFlags(disable_cpu_flags_); \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \ - dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y_p, kWidth, src_u_p, kSrcHalfWidth, \ + src_v_p, kSrcHalfWidth, \ + reinterpret_cast<DST_T*>(dst_y_c), kWidth, \ + 
reinterpret_cast<DST_T*>(dst_uv_c), \ + kDstHalfWidth * 2, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ - src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \ - dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ + src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \ + reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \ + reinterpret_cast<DST_T*>(dst_uv_opt), kDstHalfWidth * 2, kWidth, \ + NEG kHeight); \ } \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ - } \ + for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \ + EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \ } \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \ - EXPECT_EQ(dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j], \ - dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]); \ - } \ + for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC * 2; ++i) { \ + EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]); \ } \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_uv_c); \ @@ -359,23 +381,33 @@ int I400ToNV21(const uint8_t* src_y, free_aligned_buffer_page_end(src_v); \ } -#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \ - TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \ - TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \ - 
TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0) - -TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2) -TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2) -TESTPLANARTOBP(I422, 2, 1, NV21, 2, 2) -TESTPLANARTOBP(I444, 1, 1, NV12, 2, 2) -TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2) -TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2) +#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ + DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \ + SRC_DEPTH) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH) + +TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8) +TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8) +TESTPLANARTOBP(I422, uint8_t, 1, 2, 1, NV21, uint8_t, 1, 2, 2, 8) +TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV12, uint8_t, 1, 2, 2, 8) +TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV21, uint8_t, 1, 2, 2, 8) +TESTPLANARTOBP(I400, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8) +TESTPLANARTOBP(I010, uint16_t, 2, 2, 2, P010, uint16_t, 2, 2, 2, 10) +TESTPLANARTOBP(I210, uint16_t, 2, 2, 1, P210, uint16_t, 2, 2, 1, 10) +TESTPLANARTOBP(I012, uint16_t, 2, 2, 2, P012, uint16_t, 2, 2, 2, 12) +TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) #define 
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ @@ -385,13 +417,13 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2) static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ - "SRC_SUBSAMP_X unsupported"); \ + "SRC_SUBSAMP_X unsupported"); \ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ - "SRC_SUBSAMP_Y unsupported"); \ + "SRC_SUBSAMP_Y unsupported"); \ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ - "DST_SUBSAMP_X unsupported"); \ + "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ - "DST_SUBSAMP_Y unsupported"); \ + "DST_SUBSAMP_Y unsupported"); \ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ @@ -407,15 +439,15 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2) align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ align_buffer_page_end(dst_uv_opt, \ 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \ - MemRandomize(src_uv + OFF, 2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \ SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ - src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \ + src_y_p[i] = \ + (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ } \ - for (int i = 0; i < 2 * kSrcHalfWidth * kSrcHalfHeight; ++i) { \ - src_uv_p[i] = src_uv_p[i] & ((1 << SRC_DEPTH) - 1); \ + for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight * 2; ++i) { \ + src_uv_p[i] = \ + (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ } \ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ memset(dst_uv_c, 2, 2 * kDstHalfWidth * 
kDstHalfHeight * DST_BPC); \ @@ -483,112 +515,111 @@ TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8) TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8) TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8) TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8) -// These formats put data in high bits, so test on full 16bit range. -TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 16) -TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 16) -TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 16) -TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 16) -TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 16) -TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 16) - -#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \ - DOY) \ - TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ - const int kHeight = benchmark_height_; \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ - align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ - OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight); \ - align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ - align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kWidth; ++j) \ - src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ - for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ - src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ - (fastrand() & 0xff); \ - } \ - } \ - memset(dst_y_c, 1, kWidth* kHeight); \ - memset(dst_u_c, 2, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_v_c, 3, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth* kHeight); \ - memset(dst_u_opt, 102, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_v_opt, 103, \ - SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - MaskCpuFlags(disable_cpu_flags_); \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y + OFF, kWidth, src_uv + OFF, \ - 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? 
dst_y_c : NULL, kWidth, \ - dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \ - SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y + OFF, kWidth, src_uv + OFF, \ - 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_opt : NULL, \ - kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \ - SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ - } \ - if (DOY) { \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ - } \ - } \ - } \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ - dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ - } \ - } \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ - dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ - } \ - } \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_u_c); \ - free_aligned_buffer_page_end(dst_v_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_u_opt); \ - free_aligned_buffer_page_end(dst_v_opt); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_uv); \ +TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10) +TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10) +TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10) +TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12) +TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12) +TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12) + +#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, 
SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ + DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ + SRC_DEPTH) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ + static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ + static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ + "SRC_SUBSAMP_X unsupported"); \ + static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ + "SRC_SUBSAMP_Y unsupported"); \ + static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ + "DST_SUBSAMP_X unsupported"); \ + static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ + "DST_SUBSAMP_Y unsupported"); \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ + const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ + const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ + const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \ + align_buffer_page_end(src_uv, \ + kSrcHalfWidth* kSrcHalfHeight* SRC_BPC * 2 + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \ + SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y_p[i] = \ + (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ + } \ + for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight * 2; 
++i) { \ + src_uv_p[i] = \ + (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ + } \ + memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ + memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ + memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \ + reinterpret_cast<DST_T*>(dst_y_c), kWidth, \ + reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \ + reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \ + NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \ + reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \ + reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \ + reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \ + NEG kHeight); \ + } \ + for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \ + EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \ + } \ + for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \ + EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \ + EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \ + } \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_u_c); \ + free_aligned_buffer_page_end(dst_v_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_u_opt); \ + free_aligned_buffer_page_end(dst_v_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ } -#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, 
_Any, +, 0, 1) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \ - 1) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0) - -TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2) -TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2) +#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ + DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \ + SRC_DEPTH) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH) + +TESTBIPLANARTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) +TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) // Provide matrix wrappers for full range bt.709 #define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \ diff --git a/unit_test/video_common_test.cc b/unit_test/video_common_test.cc index eb183aaa..6c6a384d 100644 --- a/unit_test/video_common_test.cc +++ 
b/unit_test/video_common_test.cc @@ -81,6 +81,11 @@ TEST_F(LibYUVBaseTest, TestFourCC) { EXPECT_TRUE(TestValidFourCC(FOURCC_H420, FOURCC_BPP_H420)); EXPECT_TRUE(TestValidFourCC(FOURCC_H422, FOURCC_BPP_H422)); EXPECT_TRUE(TestValidFourCC(FOURCC_H010, FOURCC_BPP_H010)); + EXPECT_TRUE(TestValidFourCC(FOURCC_H210, FOURCC_BPP_H210)); + EXPECT_TRUE(TestValidFourCC(FOURCC_I010, FOURCC_BPP_I010)); + EXPECT_TRUE(TestValidFourCC(FOURCC_I210, FOURCC_BPP_I210)); + EXPECT_TRUE(TestValidFourCC(FOURCC_P010, FOURCC_BPP_P010)); + EXPECT_TRUE(TestValidFourCC(FOURCC_P210, FOURCC_BPP_P210)); EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG)); EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12)); EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16)); |