diff options
author | Frank Barchard <fbarchard@google.com> | 2021-03-04 12:33:02 -0800 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2021-03-05 01:09:37 +0000 |
commit | ba033a11e3948e4b361e0414caa57f793584b46e (patch) | |
tree | 1037b49cad50b9564db77c505aec9740f2bc88f6 | |
parent | 95ff456c3335c2c541e2bc5038a2b01eea08cd33 (diff) | |
download | libyuv-ba033a11e3948e4b361e0414caa57f793584b46e.tar.gz |
Add 12 bit YUV to 10 bit RGB
Bug: libyuv:843
Change-Id: I0104c8fcaeed09e83d2fd654c6a5e7d41bcb74cf
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2727775
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
-rw-r--r-- | docs/formats.md | 25 | ||||
-rw-r--r-- | include/libyuv/convert_argb.h | 28 | ||||
-rw-r--r-- | include/libyuv/row.h | 111 | ||||
-rw-r--r-- | include/libyuv/scale_row.h | 16 | ||||
-rw-r--r-- | source/convert_argb.cc | 140 | ||||
-rw-r--r-- | source/row_any.cc | 48 | ||||
-rw-r--r-- | source/row_common.cc | 439 | ||||
-rw-r--r-- | source/row_gcc.cc | 623 | ||||
-rw-r--r-- | source/scale.cc | 2 | ||||
-rw-r--r-- | unit_test/convert_test.cc | 200 | ||||
-rw-r--r-- | unit_test/scale_argb_test.cc | 2 | ||||
-rw-r--r-- | unit_test/scale_test.cc | 2 | ||||
-rw-r--r-- | unit_test/scale_uv_test.cc | 2 | ||||
-rw-r--r-- | unit_test/unit_test.cc | 8 |
14 files changed, 1128 insertions, 518 deletions
diff --git a/docs/formats.md b/docs/formats.md index a29ed5c3..5fc19d45 100644 --- a/docs/formats.md +++ b/docs/formats.md @@ -4,7 +4,9 @@ Formats (FOURCC) supported by libyuv are detailed here. # Core Formats -There are 2 core formats supported by libyuv - I420 and ARGB. All YUV formats can be converted to/from I420. All RGB formats can be converted to/from ARGB. +There are 2 core formats supported by libyuv - I420 and ARGB. + All YUV formats can be converted to/from I420. + All RGB formats can be converted to/from ARGB. Filtering functions such as scaling and planar functions work on I420 and/or ARGB. @@ -109,6 +111,27 @@ The following is extracted from video_common.h as a complete list of formats sup I444, NV24 and NV42 are full width, full height I400 and J400 have no chroma channel. +# Color space + The YUV formats start with a letter to specify the color space. e.g. I420 + I = BT.601 limited range + J = BT.601 full range (J = JPEG that uses this) + H = BT.709 limited range (H for HD) + F = BT.709 full range (F for Full range) + U = BT.2020 limited range (U for UHD) + V = BT.2020 full range + For YUV to RGB conversions, a matrix can be passed. See also convert_argb.h + +# HDR formats + Planar formats with 10 or 12 bits use the following fourcc: + I010, I012, P010, P012 are half width, half height + I210, I212, P210, P212 are half width, full height + I410, I412, P410, P412 are full width, full height + where + I is the color space (see above) and 3 planes: Y, U and V. + P is a biplanar format, similar to NV12 but 16 bits, with the valid bits in the high bits. There is a Y plane and a UV plane. + 0, 2 or 4 is the last digit of subsampling: 4:2:0, 4:2:2, or 4:4:4 + 10 or 12 is the bits per channel. The bits are in the low bits of a 16 bit channel. + # The ARGB FOURCC There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA. ARGB is most common by far, used for screen formats, and windows webcam drivers. 
diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index 419e7430..474a8214 100644 --- a/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -1488,6 +1488,34 @@ int I010ToARGBMatrix(const uint16_t* src_y, int width, int height); +// multiply 12 bit yuv into high bits to allow any number of bits. +LIBYUV_API +int I012ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert 12 bit YUV to ARGB with matrix. +LIBYUV_API +int I012ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert 10 bit 422 YUV to ARGB with matrix. LIBYUV_API int I210ToARGBMatrix(const uint16_t* src_y, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 76536314..98514f46 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -282,6 +282,8 @@ extern "C" { #define HAS_HALFMERGEUVROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 +#define HAS_I212TOAR30ROW_SSSE3 +#define HAS_I212TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOAR30ROW_SSSE3 #define HAS_I410TOAR30ROW_SSSE3 @@ -320,6 +322,8 @@ extern "C" { #define HAS_MERGEARGBROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 +#define HAS_I212TOAR30ROW_AVX2 +#define HAS_I212TOARGBROW_AVX2 #define HAS_I400TOARGBROW_AVX2 #define HAS_I410TOAR30ROW_AVX2 #define HAS_I410TOARGBROW_AVX2 @@ -721,9 +725,15 @@ struct YuvConstants { #else // This struct is for Intel color conversion. 
struct YuvConstants { +#if LIBYUV_UNLIMITED_DATA + uint8_t kUVToB[32]; + uint8_t kUVToG[32]; + uint8_t kUVToR[32]; +#else int8_t kUVToB[32]; int8_t kUVToG[32]; int8_t kUVToR[32]; +#endif int16_t kUVBiasB[16]; int16_t kUVBiasG[16]; int16_t kUVBiasR[16]; @@ -2040,10 +2050,10 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, int depth, int width); void MergeUVRow_16_Any_AVX2(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width); void MergeUVRow_16_NEON(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, @@ -2591,6 +2601,18 @@ void I210ToARGBRow_C(const uint16_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); +void I212ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I212ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); void I410ToAR30Row_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, @@ -2617,7 +2639,6 @@ void I410AlphaToARGBRow_C(const uint16_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); - void I444AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -2769,6 +2790,18 @@ void I210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I212ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I212ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I410ToAR30Row_SSSE3(const uint16_t* src_y, const uint16_t* 
src_u, const uint16_t* src_v, @@ -2813,6 +2846,18 @@ void I210ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width); +void I212ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I212ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I410ToAR30Row_AVX2(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, @@ -3081,6 +3126,18 @@ void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I212ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I212ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I410ToAR30Row_Any_SSSE3(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, @@ -3125,6 +3182,18 @@ void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I212ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I212ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I410ToAR30Row_Any_AVX2(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, @@ -3788,25 +3857,25 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr, const struct YuvConstants* yuvconstants, int width); void P210ToARGBRow_NEON(const uint16_t* y_buf, - 
const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void P410ToARGBRow_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void P210ToAR30Row_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void P410ToAR30Row_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void P210ToARGBRow_Any_NEON(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 9ad51a56..86a2cf08 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -626,13 +626,13 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); + uint16_t* dst_ptr, + int dst_width); void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); @@ -682,8 +682,8 @@ void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, - 
uint16_t* dst_ptr, - int dst_width); + uint16_t* dst_ptr, + int dst_width); void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index eb185b6e..2b3d52d2 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -888,6 +888,63 @@ int U010ToAB30(const uint16_t* src_y, &kYuv2020Constants, width, height); } +// Convert 12 bit YUV to ARGB with matrix. +// TODO(fbarchard): Consider passing scale multiplier to I212ToARGB to +// multiply 12 bit yuv into high bits to allow any number of bits. +LIBYUV_API +int I012ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I212ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I212TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I212ToAR30Row = I212ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I212ToAR30Row = I212ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I212TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I212ToAR30Row = I212ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I212ToAR30Row = I212ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + // Convert 10 bit YUV to ARGB with matrix. // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // multiply 10 bit yuv into high bits to allow any number of bits. @@ -1061,7 +1118,7 @@ int I410ToAR30Matrix(const uint16_t* src_y, void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = - I410ToAR30Row_C; + I410ToAR30Row_C; if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1260,6 +1317,61 @@ int U010ToABGR(const uint16_t* src_y, width, height); } +// Convert 12 bit YUV to ARGB with matrix. 
+LIBYUV_API +int I012ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I212ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I212TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I212ToARGBRow = I212ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I212ToARGBRow = I212ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I212TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I212ToARGBRow = I212ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I212ToARGBRow = I212ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + // Convert 10 bit 422 YUV to ARGB with matrix. 
LIBYUV_API int I210ToARGBMatrix(const uint16_t* src_y, @@ -1437,7 +1549,7 @@ int I410ToARGBMatrix(const uint16_t* src_y, void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = - I410ToARGBRow_C; + I410ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1484,9 +1596,9 @@ int P010ToARGBMatrix(const uint16_t* src_y, int width, int height) { int y; - void (*P210ToARGBRow)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - P210ToARGBRow_C; + void (*P210ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1534,9 +1646,9 @@ int P210ToARGBMatrix(const uint16_t* src_y, int width, int height) { int y; - void (*P210ToARGBRow)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - P210ToARGBRow_C; + void (*P210ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1582,9 +1694,9 @@ int P010ToAR30Matrix(const uint16_t* src_y, int width, int height) { int y; - void (*P210ToAR30Row)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - P210ToAR30Row_C; + void (*P210ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1632,9 +1744,9 @@ int P210ToAR30Matrix(const uint16_t* src_y, 
int width, int height) { int y; - void (*P210ToAR30Row)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - P210ToAR30Row_C; + void (*P210ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } diff --git a/source/row_any.cc b/source/row_any.cc index bcb59ea7..f68d2ed6 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -138,19 +138,47 @@ ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7) } #ifdef HAS_I210ALPHATOARGBROW_SSSE3 -ANY41CT(I210AlphaToARGBRow_Any_SSSE3, I210AlphaToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +ANY41CT(I210AlphaToARGBRow_Any_SSSE3, + I210AlphaToARGBRow_SSSE3, + 1, + 0, + uint16_t, + 2, + 4, + 7) #endif #ifdef HAS_I210ALPHATOARGBROW_AVX2 -ANY41CT(I210AlphaToARGBRow_Any_AVX2, I210AlphaToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +ANY41CT(I210AlphaToARGBRow_Any_AVX2, + I210AlphaToARGBRow_AVX2, + 1, + 0, + uint16_t, + 2, + 4, + 15) #endif #ifdef HAS_I410ALPHATOARGBROW_SSSE3 -ANY41CT(I410AlphaToARGBRow_Any_SSSE3, I410AlphaToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) +ANY41CT(I410AlphaToARGBRow_Any_SSSE3, + I410AlphaToARGBRow_SSSE3, + 0, + 0, + uint16_t, + 2, + 4, + 7) #endif #ifdef HAS_I410ALPHATOARGBROW_AVX2 -ANY41CT(I410AlphaToARGBRow_Any_AVX2, I410AlphaToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) +ANY41CT(I410AlphaToARGBRow_Any_AVX2, + I410AlphaToARGBRow_AVX2, + 0, + 0, + uint16_t, + 2, + 4, + 15) #endif #undef ANY41CT @@ -382,6 +410,18 @@ ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) #ifdef HAS_I210TOARGBROW_MMI ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7) #endif +#ifdef HAS_I212TOAR30ROW_SSSE3 +ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I212TOARGBROW_SSSE3 
+ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I212TOARGBROW_AVX2 +ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I212TOAR30ROW_AVX2 +ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif #undef ANY31CT // Any 2 planes to 1. diff --git a/source/row_common.cc b/source/row_common.cc index ad4e95ea..d959ccd1 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -21,6 +21,11 @@ namespace libyuv { extern "C" { #endif +// These 2 macros control YUV to RGB using unsigned math to extend range. +// They can be used separately to enable new code and old data (clamped) +// LIBYUV_UNLIMITED_DATA +// LIBYUV_UNLIMITED_CODE + // The following ifdef from row_win makes the C code match the row_win code, // which is 7 bit fixed point. #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ @@ -1395,7 +1400,11 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // KR = 0.299; KB = 0.114 // U and V contributions to R,G,B. +#if LIBYUV_UNLIMITED_DATA +#define UB 129 /* round(2.018 * 64) */ +#else #define UB 128 /* max(128, round(2.018 * 64)) */ +#endif #define UG 25 /* round(0.391 * 64) */ #define VG 52 /* round(0.813 * 64) */ #define VR 102 /* round(1.596 * 64) */ @@ -1444,9 +1453,12 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR) // B = (Y - 16) * 1.164 + U * 2.112 // KR = 0.2126, KB = 0.0722 -// TODO(fbarchard): Find way to express 2.112 instead of 2.0. // U and V contributions to R,G,B. 
+#if LIBYUV_UNLIMITED_DATA +#define UB 135 /* round(2.112 * 64) */ +#else #define UB 128 /* max(128, round(2.112 * 64)) */ +#endif #define UG 14 /* round(0.213 * 64) */ #define VG 34 /* round(0.533 * 64) */ #define VR 115 /* round(1.793 * 64) */ @@ -1495,9 +1507,12 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR) // B = (Y - 16) * 1.164384 + U * 2.14177 // KR = 0.2627; KB = 0.0593 -// TODO(fbarchard): Improve accuracy; the B channel is off by 7%. // U and V contributions to R,G,B. +#if LIBYUV_UNLIMITED_DATA +#define UB 137 /* round(2.142 * 64) */ +#else #define UB 128 /* max(128, round(2.142 * 64)) */ +#endif #define UG 12 /* round(0.187326 * 64) */ #define VG 42 /* round(0.65042 * 64) */ #define VR 107 /* round(1.67867 * 64) */ @@ -1545,15 +1560,61 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR) #undef MAKEYUVCONSTANTS +#if LIBYUV_UNLIMITED_DATA + +// C reference code that mimics the YUV assembly. +// Reads 8 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel(uint8_t y, + uint8_t u, + uint8_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[1]; +#elif defined(__arm__) + int ub = yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[1]; +#else + int ub = -yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg 
= yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = Clamp((int32_t)(y1 + (u * ub) + bb) >> 6); + *g = Clamp((int32_t)(y1 - (u * ug + v * vg) + bg) >> 6); + *r = Clamp((int32_t)(y1 + (v * vr) + br) >> 6); +} +#else // C reference code that mimics the YUV assembly. // Reads 8 bit YUV and leaves result as 8 bit. -static __inline void YuvPixel8_8(uint8_t y, - uint8_t u, - uint8_t v, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { +static __inline void YuvPixel(uint8_t y, + uint8_t u, + uint8_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { #if defined(__aarch64__) int ub = -yuvconstants->kUVToRB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1584,10 +1645,11 @@ static __inline void YuvPixel8_8(uint8_t y, #endif uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = Clamp((int32_t)(y1 + -(u * ub) + bb) >> 6); - *g = Clamp((int32_t)(y1 + -(u * ug + v * vg) + bg) >> 6); - *r = Clamp((int32_t)(y1 + -(v * vr) + br) >> 6); + *b = Clamp((int32_t)(y1 - (u * ub) + bb) >> 6); + *g = Clamp((int32_t)(y1 - (u * ug + v * vg) + bg) >> 6); + *r = Clamp((int32_t)(y1 - (v * vr) + br) >> 6); } +#endif // Reads 8 bit YUV and leaves result as 16 bit. static __inline void YuvPixel8_16(uint8_t y, @@ -1627,9 +1689,9 @@ static __inline void YuvPixel8_16(uint8_t y, #endif uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = (int)(-(u * ub) + y1 + bb); - *g = (int)(-(u * ug + v * vg) + y1 + bg); - *r = (int)(-(v * vr) + y1 + br); + *b = (int)(y1 - (u * ub) + bb); + *g = (int)(y1 - (u * ug + v * vg) + bg); + *r = (int)(y1 - (v * vr) + br); } // C reference code that mimics the YUV 16 bit assembly. @@ -1678,15 +1740,61 @@ static __inline void YuvPixel10_16(uint16_t y, *r = (int)(-(v * vr) + y1 + br); } +// C reference code that mimics the YUV 16 bit assembly. 
+// Reads 12 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel12_16(int16_t y, + int16_t u, + int16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[1]; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[1]; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)((y << 4) * yg) >> 16; + u = clamp255(u >> 4); + v = clamp255(v >> 4); + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + // C reference code that mimics the YUV 10 bit assembly. // Reads 10 bit YUV and clamps down to 8 bit RGB. -static __inline void YuvPixel10_8(uint16_t y, - uint16_t u, - uint16_t v, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { +static __inline void YuvPixel10(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { int b16; int g16; int r16; @@ -1696,6 +1804,24 @@ static __inline void YuvPixel10_8(uint16_t y, *r = Clamp(r16 >> 6); } +// C reference code that mimics the YUV 12 bit assembly. 
+// Reads 12 bit YUV and clamps down to 8 bit RGB. +static __inline void YuvPixel12(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = Clamp(b16 >> 6); + *g = Clamp(g16 >> 6); + *r = Clamp(r16 >> 6); +} + // C reference code that mimics the YUV 16 bit assembly. // Reads 16 bit YUV and leaves result as 8 bit. static __inline void YuvPixel16_8(uint16_t y, @@ -1783,9 +1909,9 @@ static __inline void YuvPixel16_16(uint16_t y, uint32_t y1 = (uint32_t)(y * yg) >> 16; u = clamp255(u >> 8); v = clamp255(v >> 8); - *b = (int)(-(u * ub) + y1 + bb); - *g = (int)(-(u * ug + v * vg) + y1 + bg); - *r = (int)(-(v * vr) + y1 + br); + *b = (int)(y1 + -(u * ub) + bb); + *g = (int)(y1 + -(u * ug + v * vg) + bg); + *r = (int)(y1 + -(v * vr) + br); } // C reference code that mimics the YUV assembly. @@ -1822,11 +1948,11 @@ void I444ToARGBRow_C(const uint8_t* src_y, for (x = 0; x < width - 1; x += 2) { uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; - YuvPixel8_8(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, - yuvconstants); + YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, + yuvconstants); rgb_buf[3] = 255; - YuvPixel8_8(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, - yuvconstants); + YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, + yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 2; @@ -1834,8 +1960,8 @@ void I444ToARGBRow_C(const uint8_t* src_y, rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1848,8 +1974,8 @@ void I444ToARGBRow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width; ++x) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; src_y += 1; src_u += 1; @@ -1868,11 +1994,11 @@ void I422ToARGBRow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 1; @@ -1880,8 +2006,8 @@ void I422ToARGBRow_C(const uint8_t* src_y, rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1895,11 +2021,11 @@ void I210ToARGBRow_C(const uint16_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel10_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 1; @@ -1907,8 +2033,8 @@ void I210ToARGBRow_C(const uint16_t* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1921,8 +2047,8 @@ void I410ToARGBRow_C(const uint16_t* src_y, int width) { int x; for (x = 0; x < width; ++x) { - YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; src_y += 1; src_u += 1; @@ -1940,11 +2066,11 @@ void I210AlphaToARGBRow_C(const uint16_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = clamp255(src_a[0] >> 2); - YuvPixel10_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + 
YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = clamp255(src_a[1] >> 2); src_y += 2; src_u += 1; @@ -1953,8 +2079,8 @@ void I210AlphaToARGBRow_C(const uint16_t* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = clamp255(src_a[0] >> 2); } } @@ -1968,8 +2094,8 @@ void I410AlphaToARGBRow_C(const uint16_t* src_y, int width) { int x; for (x = 0; x < width; ++x) { - YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = clamp255(src_a[0] >> 2); src_y += 1; src_u += 1; @@ -1979,6 +2105,33 @@ void I410AlphaToARGBRow_C(const uint16_t* src_y, } } +// 12 bit YUV to ARGB +void I212ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { uint32_t ar30; b = b >> 4; // convert 8 bit 10.6 to 10 bit. 
@@ -2018,6 +2171,33 @@ void I210ToAR30Row_C(const uint16_t* src_y, } } +// 12 bit YUV to 10 bit AR30 +void I212ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + void I410ToAR30Row_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, @@ -2038,6 +2218,7 @@ void I410ToAR30Row_C(const uint16_t* src_y, } } +// P210 has 10 bits in msb of 16 bit NV12 style layout. void P210ToARGBRow_C(const uint16_t* src_y, const uint16_t* src_uv, uint8_t* rgb_buf, @@ -2163,11 +2344,11 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y, for (x = 0; x < width - 1; x += 2) { uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; - YuvPixel8_8(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, - yuvconstants); + YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, + yuvconstants); rgb_buf[3] = src_a[0]; - YuvPixel8_8(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, - yuvconstants); + YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, + yuvconstants); rgb_buf[7] = src_a[1]; src_y += 2; src_u += 2; @@ -2176,8 +2357,8 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y, rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; } } @@ -2191,8 +2372,8 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width; ++x) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; src_y += 1; src_u += 1; @@ -2212,11 +2393,11 @@ void I422AlphaToARGBRow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; - YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = src_a[1]; src_y += 2; src_u += 1; @@ -2225,8 +2406,8 @@ void I422AlphaToARGBRow_C(const uint8_t* src_y, rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; } } @@ -2239,18 +2420,18 @@ void I422ToRGB24Row_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, - rgb_buf + 5, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); src_y += 2; src_u += 1; src_v += 1; rgb_buf += 6; // Advance 2 pixels. } if (width & 1) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); } } @@ -2268,8 +2449,8 @@ void I422ToARGB4444Row_C(const uint8_t* src_y, uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - YuvPixel8_8(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; @@ -2284,7 +2465,7 @@ void I422ToARGB4444Row_C(const uint8_t* src_y, dst_argb4444 += 4; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; @@ -2306,8 +2487,8 @@ void I422ToARGB1555Row_C(const uint8_t* src_y, uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - YuvPixel8_8(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; @@ -2322,7 +2503,7 @@ void I422ToARGB1555Row_C(const uint8_t* src_y, dst_argb1555 += 4; // Advance 2 pixels. } if (width & 1) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; @@ -2344,8 +2525,8 @@ void I422ToRGB565Row_C(const uint8_t* src_y, uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - YuvPixel8_8(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -2360,7 +2541,7 @@ void I422ToRGB565Row_C(const uint8_t* src_y, dst_rgb565 += 4; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -2375,19 +2556,19 @@ void NV12ToARGBRow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel8_8(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_uv += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -2399,19 +2580,19 @@ void NV21ToARGBRow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel8_8(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_vu += 2; rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -2423,17 +2604,17 @@ void NV12ToRGB24Row_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - YuvPixel8_8(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, - rgb_buf + 5, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); src_y += 2; src_uv += 2; rgb_buf += 6; // Advance 2 pixels. } if (width & 1) { - YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); } } @@ -2444,17 +2625,17 @@ void NV21ToRGB24Row_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - YuvPixel8_8(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, - rgb_buf + 5, yuvconstants); + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); src_y += 2; src_vu += 2; rgb_buf += 6; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); } } @@ -2471,8 +2652,8 @@ void NV12ToRGB565Row_C(const uint8_t* src_y, uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); - YuvPixel8_8(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -2486,7 +2667,7 @@ void NV12ToRGB565Row_C(const uint8_t* src_y, dst_rgb565 += 4; // Advance 2 pixels. } if (width & 1) { - YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -2500,18 +2681,18 @@ void YUY2ToARGBRow_C(const uint8_t* src_yuy2, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel8_8(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_yuy2 += 4; rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel8_8(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -2522,18 +2703,18 @@ void UYVYToARGBRow_C(const uint8_t* src_uyvy, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel8_8(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_uyvy += 4; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel8_8(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -2546,11 +2727,11 @@ void I422ToRGBARow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, - rgb_buf + 3, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; - YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, - rgb_buf + 7, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, + rgb_buf + 7, yuvconstants); rgb_buf[4] = 255; src_y += 2; src_u += 1; @@ -2558,8 +2739,8 @@ void I422ToRGBARow_C(const uint8_t* src_y, rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, - rgb_buf + 3, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; } } diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 2c823a13..f4d9978b 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -2001,6 +2001,19 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, "packuswb %%xmm5,%%xmm5 \n" \ "lea 0x10(%[a_buf]),%[a_buf] \n" +// Read 4 UV from 422 12 bit, upsample to 8 UV +#define READYUV212 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm0 \n" \ + "psraw $0x4,%%xmm0 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $0x4,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. #define READYUVA422 \ "movd (%[u_buf]),%%xmm0 \n" \ @@ -2398,6 +2411,36 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, ); } +// 12 bit YUV to ARGB +void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV212 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + // 10 bit YUV to AR30 void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, @@ -2433,6 +2476,41 @@ void OMITFP 
I210ToAR30Row_SSSE3(const uint16_t* y_buf, ); } +// 12 bit YUV to AR30 +void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV212 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + // 10 bit YUV to ARGB void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, @@ -2443,16 +2521,16 @@ void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV410 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2474,29 +2552,26 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, int width) { asm volatile( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + YUVTORGB_SETUP( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA210 - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - 
[v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), - [dst_argb] "+r"(dst_argb), // %[dst_argb] + LABELALIGN "1: \n" READYUVA210 + YUVTORGB(yuvconstants) STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width]"+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width]"+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } #endif @@ -2511,29 +2586,26 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, int width) { asm volatile( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + YUVTORGB_SETUP( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA410 - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), - [dst_argb] "+r"(dst_argb), // %[dst_argb] + LABELALIGN "1: \n" READYUVA410 + YUVTORGB(yuvconstants) STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width]"+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width]"+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + : [yuvconstants] "r"(yuvconstants) // 
%[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } #endif @@ -2547,21 +2619,21 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min - "psrlw $6,%%xmm7 \n" // 1023 for max + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READYUV410 YUVTORGB16(yuvconstants) STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2729,26 +2801,22 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READP210 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [uv_buf] "+r"(uv_buf), // %[u_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN "1: \n" READP210 + YUVTORGB(yuvconstants) STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[u_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS 
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, @@ -2756,25 +2824,22 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READP410 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [uv_buf] "+r"(uv_buf), // %[u_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); + LABELALIGN "1: \n" READP410 + YUVTORGB(yuvconstants) STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[u_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, @@ -2785,20 +2850,20 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min - "psrlw $6,%%xmm7 \n" // 1023 for max + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READP210 YUVTORGB16(yuvconstants) STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), 
// %[uv_buf] [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] @@ -2817,20 +2882,20 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, asm volatile ( YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min - "psrlw $6,%%xmm7 \n" // 1023 for max + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READP410 YUVTORGB16(yuvconstants) STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] @@ -2948,6 +3013,21 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpsllw $6,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" +// Read 8 UV from 212 12 bit, upsample to 16 UV +#define READYUV212_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x4,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $0x4,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + // Read 16 UV from 410. With 16 Alpha. #define READYUVA410_AVX2 \ "vmovdqu (%[u_buf]),%%ymm0 \n" \ @@ -3295,6 +3375,41 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, } #endif // HAS_I210TOARGBROW_AVX2 +#if defined(HAS_I212TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
+void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV212_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I212TOARGBROW_AVX2 + #if defined(HAS_I210TOAR30ROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). @@ -3335,6 +3450,46 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, } #endif // HAS_I210TOAR30ROW_AVX2 +#if defined(HAS_I212TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV212_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I212TOAR30ROW_AVX2 + #if defined(HAS_I410TOARGBROW_AVX2) // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
@@ -3347,17 +3502,17 @@ void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf, asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV410_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] @@ -3383,32 +3538,28 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, int width) { asm volatile( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + YUVTORGB_SETUP_AVX2( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA210_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + LABELALIGN "1: \n" READYUVA210_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), // %[a_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width]"+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width]"+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5" - ); + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); } #endif // HAS_I210TOARGBROW_AVX2 @@ -3424,32 +3575,28 @@ void OMITFP I410AlphaToARGBRow_AVX2(const 
uint16_t* y_buf, int width) { asm volatile( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + YUVTORGB_SETUP_AVX2( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA410_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + LABELALIGN "1: \n" READYUVA410_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), // %[a_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width]"+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width]"+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5" - ); + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); } #endif // HAS_I410TOARGBROW_AVX2 @@ -3465,23 +3612,23 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf, asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 
\n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READYUV410_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -3764,16 +3911,16 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READP210_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -3797,16 +3944,16 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READP410_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] @@ -3830,22 +3977,22 @@ void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf, asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + 
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READP210_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] @@ -3868,22 +4015,22 @@ void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf, asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READP410_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" + "sub $0x10,%[width] \n" + "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] @@ -4409,33 +4556,33 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, depth = 16 - depth; // clang-format off asm volatile ( - "vmovd %4,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%xmm3 \n" - "vbroadcastf128 %5,%%ymm4 \n" - "sub %1,%2 \n" + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%xmm3 \n" + "vbroadcastf128 %5,%%ymm4 \n" + "sub %1,%2 \n" // 16 pixels per loop. 
LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "add $0x40,%0 \n" - - "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x0,%%ymm1,0x10(%1) \n" - "vextractf128 $0x1,%%ymm0,(%1,%2) \n" - "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" - "add $0x20,%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + + "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x0,%%ymm1,0x10(%1) \n" + "vextractf128 $0x1,%%ymm0,(%1,%2) \n" + "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" + "add $0x20,%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4499,24 +4646,24 @@ void DivideRow_16_AVX2(const uint16_t* src_y, int width) { // clang-format off asm volatile ( - "vmovd %3,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" // 32 pixels per loop. 
LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%0,%1) \n" - "vmovdqu %%ymm1,0x20(%0,%1) \n" - "add $0x40,%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width), // %2 @@ -5173,7 +5320,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); @@ -5264,7 +5411,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit), // %6 "m"(kShuffleMaskARGBPermute) // %7 @@ -7981,7 +8128,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -8019,7 +8166,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4"); } diff --git a/source/scale.cc b/source/scale.cc index 3ccd2111..a254737c 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1619,7 +1619,7 @@ void ScalePlaneUp2_16_Bilinear(int src_width, uint16_t* dst_ptr) { void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; + ScaleRowUp2_Bilinear_16_Any_C; int x; // This function can only scale up by 2 times. 
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index de5cd00e..84584582 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -3167,67 +3167,11 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { } #endif // HAS_ABGRTOAR30ROW_AVX2 -// TODO(fbarchard): Fix clamping issue affected by U channel. -#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ - TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ - const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - const int kBpc = 2; \ - align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ - align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = \ - (fastrand() & ((1 << S_DEPTH) - 1)); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = \ - (fastrand() & ((1 << S_DEPTH) - 1)); \ - reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = \ - (fastrand() & ((1 << S_DEPTH) - 1)); \ - } \ - memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ - memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_PLANAR##To##FMT_B( \ - reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ - reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \ - reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ - dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < 
benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B( \ - reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ - reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \ - reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ - dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ - EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \ - } \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - } - -#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, S_DEPTH) \ - TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, _Any, +, 0, 0, S_DEPTH) \ - TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Unaligned, +, 1, 1, S_DEPTH) \ - TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ - TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) +// Provide matrix wrappers for 12 bit YUV +#define I012ToARGB(a, b, c, d, e, f, g, h, i, j) \ + I012ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define I012ToAR30(a, b, c, d, e, f, g, h, i, j) \ + I012ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) #define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) @@ -3254,43 +3198,105 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { #define U410ToAB30(a, b, c, d, e, f, g, h, i, j) \ I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) -TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1, 10) -TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1, 10) -TESTPLANAR16TOB(H010, 2, 2, ARGB, 
4, 4, 1, 10) -TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1, 10) -TESTPLANAR16TOB(U010, 2, 2, ARGB, 4, 4, 1, 10) -TESTPLANAR16TOB(U010, 2, 2, ABGR, 4, 4, 1, 10) -TESTPLANAR16TOB(I210, 2, 1, ARGB, 4, 4, 1, 10) -TESTPLANAR16TOB(I210, 2, 1, ABGR, 4, 4, 1, 10) -TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1, 10) -TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1, 10) -TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1, 10) -TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1, 10) -TESTPLANAR16TOB(I410, 1, 1, ARGB, 4, 4, 1, 10) -TESTPLANAR16TOB(I410, 1, 1, ABGR, 4, 4, 1, 10) -TESTPLANAR16TOB(H410, 1, 1, ARGB, 4, 4, 1, 10) -TESTPLANAR16TOB(H410, 1, 1, ABGR, 4, 4, 1, 10) -TESTPLANAR16TOB(U410, 1, 1, ARGB, 4, 4, 1, 10) -TESTPLANAR16TOB(U410, 1, 1, ABGR, 4, 4, 1, 10) +// TODO(fbarchard): Fix clamping issue affected by U channel. +#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \ + BPP_B, ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + const int kBpc = 2; \ + align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ + align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = (fastrand() & FMT_MASK); \ + reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = (fastrand() & FMT_MASK); \ + } \ + memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ + dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ + dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \ + } \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(dst_argb_c); \ + 
free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \ + BPP_B, ALIGN, YALIGN) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + ALIGN, YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + ALIGN, YALIGN, benchmark_width_, _Unaligned, +, 1, 1) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + ALIGN, YALIGN, benchmark_width_, _Invert, -, 0, 0) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + ALIGN, YALIGN, benchmark_width_, _Opt, +, 0, 0) + +TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(I012, 2, 2, 0xfff, ARGB, 4, 4, 1) + #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 10) -TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 10) -TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 10) -TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1, 10) -TESTPLANAR16TOB(U010, 2, 2, AR30, 4, 4, 1, 10) -TESTPLANAR16TOB(U010, 2, 2, AB30, 4, 4, 1, 10) 
-TESTPLANAR16TOB(I210, 2, 1, AR30, 4, 4, 1, 10) -TESTPLANAR16TOB(I210, 2, 1, AB30, 4, 4, 1, 10) -TESTPLANAR16TOB(H210, 2, 1, AR30, 4, 4, 1, 10) -TESTPLANAR16TOB(H210, 2, 1, AB30, 4, 4, 1, 10) -TESTPLANAR16TOB(U210, 2, 1, AR30, 4, 4, 1, 10) -TESTPLANAR16TOB(U210, 2, 1, AB30, 4, 4, 1, 10) -TESTPLANAR16TOB(I410, 1, 1, AR30, 4, 4, 1, 10) -TESTPLANAR16TOB(I410, 1, 1, AB30, 4, 4, 1, 10) -TESTPLANAR16TOB(H410, 1, 1, AR30, 4, 4, 1, 10) -TESTPLANAR16TOB(H410, 1, 1, AB30, 4, 4, 1, 10) -TESTPLANAR16TOB(U410, 1, 1, AR30, 4, 4, 1, 10) -TESTPLANAR16TOB(U410, 1, 1, AB30, 4, 4, 1, 10) +TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(I012, 2, 2, 0xfff, AR30, 4, 4, 1) #endif #define TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc index ac976612..cfc12f3a 100644 --- a/unit_test/scale_argb_test.cc +++ b/unit_test/scale_argb_test.cc @@ -302,7 +302,7 @@ TEST_FACTOR(3, 1, 3) TEST_SCALETO1(name, width, height, Bilinear, 3) TEST_SCALETO(ARGBScale, 1, 1) -TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */ +TEST_SCALETO(ARGBScale, 
256, 144) /* 128x72 * 2 */ TEST_SCALETO(ARGBScale, 320, 240) TEST_SCALETO(ARGBScale, 569, 480) TEST_SCALETO(ARGBScale, 640, 360) diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 066bcfde..6da6b574 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -1025,7 +1025,7 @@ TEST_FACTOR(3, 1, 3, 0) #endif TEST_SCALETO(Scale, 1, 1) -TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */ +TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(Scale, 320, 240) TEST_SCALETO(Scale, 569, 480) TEST_SCALETO(Scale, 640, 360) diff --git a/unit_test/scale_uv_test.cc b/unit_test/scale_uv_test.cc index 59eeee30..6e4649f8 100644 --- a/unit_test/scale_uv_test.cc +++ b/unit_test/scale_uv_test.cc @@ -166,7 +166,7 @@ TEST_FACTOR(3, 1, 3) TEST_SCALETO1(name, width, height, Bilinear, 3) TEST_SCALETO(UVScale, 1, 1) -TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */ +TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(UVScale, 320, 240) TEST_SCALETO(UVScale, 569, 480) TEST_SCALETO(UVScale, 640, 360) diff --git a/unit_test/unit_test.cc b/unit_test/unit_test.cc index 85e3b717..e6dbc3ee 100644 --- a/unit_test/unit_test.cc +++ b/unit_test/unit_test.cc @@ -26,9 +26,13 @@ unsigned int fastrand_seed = 0xfb; ABSL_FLAG(int32_t, libyuv_width, 0, "width of test image."); ABSL_FLAG(int32_t, libyuv_height, 0, "height of test image."); ABSL_FLAG(int32_t, libyuv_repeat, 0, "number of times to repeat test."); -ABSL_FLAG(int32_t, libyuv_flags, 0, +ABSL_FLAG(int32_t, + libyuv_flags, + 0, "cpu flags for reference code. 1 = C, -1 = SIMD"); -ABSL_FLAG(int32_t, libyuv_cpu_info, 0, +ABSL_FLAG(int32_t, + libyuv_cpu_info, + 0, "cpu flags for benchmark code. 1 = C, -1 = SIMD"); #else // Disable command line parameters if absl/flags disabled. |