author     Frank Barchard <fbarchard@google.com>   2021-03-04 12:33:02 -0800
committer  Frank Barchard <fbarchard@chromium.org> 2021-03-05 01:09:37 +0000
commit     ba033a11e3948e4b361e0414caa57f793584b46e (patch)
tree       1037b49cad50b9564db77c505aec9740f2bc88f6
parent     95ff456c3335c2c541e2bc5038a2b01eea08cd33 (diff)
download   libyuv-ba033a11e3948e4b361e0414caa57f793584b46e.tar.gz

Add 12 bit YUV to 10 bit RGB

Bug: libyuv:843
Change-Id: I0104c8fcaeed09e83d2fd654c6a5e7d41bcb74cf
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2727775
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
 docs/formats.md               |  25
 include/libyuv/convert_argb.h |  28
 include/libyuv/row.h          | 111
 include/libyuv/scale_row.h    |  16
 source/convert_argb.cc        | 140
 source/row_any.cc             |  48
 source/row_common.cc          | 439
 source/row_gcc.cc             | 623
 source/scale.cc               |   2
 unit_test/convert_test.cc     | 200
 unit_test/scale_argb_test.cc  |   2
 unit_test/scale_test.cc       |   2
 unit_test/scale_uv_test.cc    |   2
 unit_test/unit_test.cc        |   8
 14 files changed, 1128 insertions(+), 518 deletions(-)
diff --git a/docs/formats.md b/docs/formats.md
index a29ed5c3..5fc19d45 100644
--- a/docs/formats.md
+++ b/docs/formats.md
@@ -4,7 +4,9 @@ Formats (FOURCC) supported by libyuv are detailed here.
# Core Formats
-There are 2 core formats supported by libyuv - I420 and ARGB. All YUV formats can be converted to/from I420. All RGB formats can be converted to/from ARGB.
+There are 2 core formats supported by libyuv - I420 and ARGB.
+ All YUV formats can be converted to/from I420.
+ All RGB formats can be converted to/from ARGB.
Filtering functions such as scaling and planar functions work on I420 and/or ARGB.
@@ -109,6 +111,27 @@ The following is extracted from video_common.h as a complete list of formats sup
I444, NV24 and NV42 are full width, full height
I400 and J400 have no chroma channel.
+# Color space
+ The YUV formats start with a letter to specify the color space. e.g. I420
+ I = BT.601 limited range
+ J = BT.601 full range (J = JPEG that uses this)
+ H = BT.709 limited range (H for HD)
+ F = BT.709 full range (F for Full range)
+ U = BT.2020 limited range (U for UHD)
+ V = BT.2020 full range
+ For YUV to RGB conversions, a matrix can be passed. See also convert_argb.h.
+
+# HDR formats
+ Planar formats with 10 or 12 bits use the following fourcc:
+ I010, I012, P010, P012 are half width, half height
+ I210, I212, P210, P212 are half width, full height
+ I410, I412, P410, P412 are full width, full height
+ where
+ I is the color space (see above) and 3 planes: Y, U and V.
+ P is a biplanar format, similar to NV12 but 16 bits, with the valid bits in the high bits. There is a Y plane and a UV plane.
+ 0, 2 or 4 is the last digit of the subsampling ratio: 4:2:0, 4:2:2 or 4:4:4.
+ 10 or 12 is the bits per channel. I formats keep the valid bits in the low bits of a 16 bit channel; P formats keep them in the high bits.
+
# The ARGB FOURCC
There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA. ARGB is most common by far, used for screen formats, and windows webcam drivers.
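Note on the HDR fourcc layout above — a minimal C sketch (not part of this patch) of where the valid bits sit in the 16 bit channel, for I formats (low bits) versus P formats (high bits):

    #include <stdint.h>

    // I012 keeps its 12 valid bits in the low bits: samples span 0x0000..0x0FFF.
    static inline uint8_t I012SampleTo8(uint16_t s) {
      return (uint8_t)(s >> 4);  // drop 4 bits of precision
    }

    // P012 keeps its 12 valid bits in the high bits: the low 4 bits are zero.
    static inline uint16_t I012SampleToP012(uint16_t s) {
      return (uint16_t)(s << 4);  // move the valid bits to the msbs
    }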
diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h
index 419e7430..474a8214 100644
--- a/include/libyuv/convert_argb.h
+++ b/include/libyuv/convert_argb.h
@@ -1488,6 +1488,34 @@ int I010ToARGBMatrix(const uint16_t* src_y,
int width,
int height);
+// Convert 12 bit YUV to AR30 with matrix.
+LIBYUV_API
+int I012ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 12 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I012ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
// Convert 10 bit 422 YUV to ARGB with matrix.
LIBYUV_API
int I210ToARGBMatrix(const uint16_t* src_y,
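A usage sketch for the new entry point (not part of this patch; stride units follow the row loops in convert_argb.cc below — uint16_t elements for the source planes, bytes for the ARGB destination). kYuvI601Constants selects the BT.601 limited range matrix:

    #include "libyuv/convert_argb.h"

    // Convert a tightly packed I012 frame (12 bit 4:2:0) to 8 bit ARGB.
    int ConvertI012ToARGB(const uint16_t* y, const uint16_t* u,
                          const uint16_t* v, uint8_t* argb,
                          int width, int height) {
      return I012ToARGBMatrix(y, width,            // Y plane, full width
                              u, (width + 1) / 2,  // U plane, half width
                              v, (width + 1) / 2,  // V plane, half width
                              argb, width * 4,     // 4 bytes per pixel
                              &kYuvI601Constants, width, height);
    }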
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 76536314..98514f46 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -282,6 +282,8 @@ extern "C" {
#define HAS_HALFMERGEUVROW_SSSE3
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3
+#define HAS_I212TOAR30ROW_SSSE3
+#define HAS_I212TOARGBROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOAR30ROW_SSSE3
#define HAS_I410TOAR30ROW_SSSE3
@@ -320,6 +322,8 @@ extern "C" {
#define HAS_MERGEARGBROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
+#define HAS_I212TOAR30ROW_AVX2
+#define HAS_I212TOARGBROW_AVX2
#define HAS_I400TOARGBROW_AVX2
#define HAS_I410TOAR30ROW_AVX2
#define HAS_I410TOARGBROW_AVX2
@@ -721,9 +725,15 @@ struct YuvConstants {
#else
// This struct is for Intel color conversion.
struct YuvConstants {
+#if LIBYUV_UNLIMITED_DATA
+ uint8_t kUVToB[32];
+ uint8_t kUVToG[32];
+ uint8_t kUVToR[32];
+#else
int8_t kUVToB[32];
int8_t kUVToG[32];
int8_t kUVToR[32];
+#endif
int16_t kUVBiasB[16];
int16_t kUVBiasG[16];
int16_t kUVBiasR[16];
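Why the UV coefficient arrays widen from int8_t to uint8_t under LIBYUV_UNLIMITED_DATA: the exact 2.6 fixed point U-to-B coefficients exceed what a signed byte can hold. A quick standalone check (coefficients taken from the tables in row_common.cc below):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      // Exact U contribution to B, scaled by 64 (2.6 fixed point).
      const double kUB[] = {2.018 /* BT.601 */, 2.112 /* BT.709 */,
                            2.142 /* BT.2020 */};
      for (int i = 0; i < 3; ++i)
        printf("round(%.3f * 64) = %.0f\n", kUB[i], round(kUB[i] * 64.0));
      // Prints 129, 135 and 137 - none fit int8_t's 127 maximum, so the
      // signed layout clamps UB to 128 and slightly undershoots blue.
      return 0;
    }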
@@ -2040,10 +2050,10 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
int depth,
int width);
void MergeUVRow_16_Any_AVX2(const uint16_t* src_u,
- const uint16_t* src_v,
- uint16_t* dst_uv,
- int depth,
- int width);
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width);
void MergeUVRow_16_NEON(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
@@ -2591,6 +2601,18 @@ void I210ToARGBRow_C(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I410ToAR30Row_C(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@@ -2617,7 +2639,6 @@ void I410AlphaToARGBRow_C(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
-
void I444AlphaToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -2769,6 +2790,18 @@ void I210ToARGBRow_SSSE3(const uint16_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I410ToAR30Row_SSSE3(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@@ -2813,6 +2846,18 @@ void I210ToAR30Row_AVX2(const uint16_t* y_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I410ToAR30Row_AVX2(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@@ -3081,6 +3126,18 @@ void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I410ToAR30Row_Any_SSSE3(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@@ -3125,6 +3182,18 @@ void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I410ToAR30Row_Any_AVX2(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@@ -3788,25 +3857,25 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
const struct YuvConstants* yuvconstants,
int width);
void P210ToARGBRow_NEON(const uint16_t* y_buf,
- const uint16_t* uv_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void P410ToARGBRow_NEON(const uint16_t* y_buf,
- const uint16_t* uv_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void P210ToAR30Row_NEON(const uint16_t* y_buf,
- const uint16_t* uv_buf,
- uint8_t* dst_ar30,
- const struct YuvConstants* yuvconstants,
- int width);
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
void P410ToAR30Row_NEON(const uint16_t* y_buf,
- const uint16_t* uv_buf,
- uint8_t* dst_ar30,
- const struct YuvConstants* yuvconstants,
- int width);
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
void P210ToARGBRow_Any_NEON(const uint16_t* y_buf,
const uint16_t* uv_buf,
uint8_t* dst_argb,
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 9ad51a56..86a2cf08 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -626,13 +626,13 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
- uint16_t* dst_ptr,
- int dst_width);
+ uint16_t* dst_ptr,
+ int dst_width);
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst_ptr,
- ptrdiff_t dst_stride,
- int dst_width);
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
@@ -682,8 +682,8 @@ void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
- uint16_t* dst_ptr,
- int dst_width);
+ uint16_t* dst_ptr,
+ int dst_width);
void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index eb185b6e..2b3d52d2 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -888,6 +888,63 @@ int U010ToAB30(const uint16_t* src_y,
&kYuv2020Constants, width, height);
}
+// Convert 12 bit YUV to AR30 with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I212ToAR30 to
+// multiply 12 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I012ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I212ToAR30Row_C;
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I212TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I212ToAR30Row = I212ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I212ToAR30Row = I212ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I212TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I212ToAR30Row = I212ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I212ToAR30Row = I212ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
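Note on the row loop above: I012 is 4:2:0, so one chroma row covers two luma rows. The "if (y & 1)" advance steps src_u/src_v only after odd rows, which is equivalent to the following indexing (sketch; the same pattern appears in I012ToARGBMatrix below):

    // Chroma row for luma row y in a 4:2:0 layout:
    const uint16_t* u_row = src_u + (y / 2) * src_stride_u;
    const uint16_t* v_row = src_v + (y / 2) * src_stride_v;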
// Convert 10 bit YUV to ARGB with matrix.
// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
// multiply 10 bit yuv into high bits to allow any number of bits.
@@ -1061,7 +1118,7 @@ int I410ToAR30Matrix(const uint16_t* src_y,
void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
- I410ToAR30Row_C;
+ I410ToAR30Row_C;
if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
return -1;
}
@@ -1260,6 +1317,61 @@ int U010ToABGR(const uint16_t* src_y,
width, height);
}
+// Convert 12 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I012ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I212ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I212TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I212ToARGBRow = I212ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I212ToARGBRow = I212ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I212TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I212ToARGBRow = I212ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I212ToARGBRow = I212ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
// Convert 10 bit 422 YUV to ARGB with matrix.
LIBYUV_API
int I210ToARGBMatrix(const uint16_t* src_y,
@@ -1437,7 +1549,7 @@ int I410ToARGBMatrix(const uint16_t* src_y,
void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
- I410ToARGBRow_C;
+ I410ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1484,9 +1596,9 @@ int P010ToARGBMatrix(const uint16_t* src_y,
int width,
int height) {
int y;
- void (*P210ToARGBRow)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- P210ToARGBRow_C;
+ void (*P210ToARGBRow)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1534,9 +1646,9 @@ int P210ToARGBMatrix(const uint16_t* src_y,
int width,
int height) {
int y;
- void (*P210ToARGBRow)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- P210ToARGBRow_C;
+ void (*P210ToARGBRow)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1582,9 +1694,9 @@ int P010ToAR30Matrix(const uint16_t* src_y,
int width,
int height) {
int y;
- void (*P210ToAR30Row)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- P210ToAR30Row_C;
+ void (*P210ToAR30Row)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C;
if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
return -1;
}
@@ -1632,9 +1744,9 @@ int P210ToAR30Matrix(const uint16_t* src_y,
int width,
int height) {
int y;
- void (*P210ToAR30Row)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- P210ToAR30Row_C;
+ void (*P210ToAR30Row)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C;
if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
return -1;
}
diff --git a/source/row_any.cc b/source/row_any.cc
index bcb59ea7..f68d2ed6 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -138,19 +138,47 @@ ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7)
}
#ifdef HAS_I210ALPHATOARGBROW_SSSE3
-ANY41CT(I210AlphaToARGBRow_Any_SSSE3, I210AlphaToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+ANY41CT(I210AlphaToARGBRow_Any_SSSE3,
+ I210AlphaToARGBRow_SSSE3,
+ 1,
+ 0,
+ uint16_t,
+ 2,
+ 4,
+ 7)
#endif
#ifdef HAS_I210ALPHATOARGBROW_AVX2
-ANY41CT(I210AlphaToARGBRow_Any_AVX2, I210AlphaToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+ANY41CT(I210AlphaToARGBRow_Any_AVX2,
+ I210AlphaToARGBRow_AVX2,
+ 1,
+ 0,
+ uint16_t,
+ 2,
+ 4,
+ 15)
#endif
#ifdef HAS_I410ALPHATOARGBROW_SSSE3
-ANY41CT(I410AlphaToARGBRow_Any_SSSE3, I410AlphaToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+ANY41CT(I410AlphaToARGBRow_Any_SSSE3,
+ I410AlphaToARGBRow_SSSE3,
+ 0,
+ 0,
+ uint16_t,
+ 2,
+ 4,
+ 7)
#endif
#ifdef HAS_I410ALPHATOARGBROW_AVX2
-ANY41CT(I410AlphaToARGBRow_Any_AVX2, I410AlphaToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15)
+ANY41CT(I410AlphaToARGBRow_Any_AVX2,
+ I410AlphaToARGBRow_AVX2,
+ 0,
+ 0,
+ uint16_t,
+ 2,
+ 4,
+ 15)
#endif
#undef ANY41CT
@@ -382,6 +410,18 @@ ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
#ifdef HAS_I210TOARGBROW_MMI
ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7)
#endif
+#ifdef HAS_I212TOAR30ROW_SSSE3
+ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I212TOARGBROW_SSSE3
+ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I212TOARGBROW_AVX2
+ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I212TOAR30ROW_AVX2
+ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
#undef ANY31CT
// Any 2 planes to 1.
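A gloss on the ANY31CT arguments (parameter names approximated from the macro's definition earlier in row_any.cc):

    // ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK)
    // For I212ToARGBRow_Any_SSSE3: UVSHIFT=1 (U/V at half horizontal rate),
    // T=uint16_t, SBPP=2 bytes per source sample, BPP=4 bytes per output
    // pixel, MASK=7 so the SIMD body covers width & ~7 pixels; the last
    // few pixels are copied to a temp buffer and run through the same
    // kernel. The AVX2 variants use MASK=15 to match their 16 pixel step.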
diff --git a/source/row_common.cc b/source/row_common.cc
index ad4e95ea..d959ccd1 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -21,6 +21,11 @@ namespace libyuv {
extern "C" {
#endif
+// These 2 macros control YUV to RGB using unsigned math to extend range.
+// They can be enabled separately, e.g. to run the new code against the
+// old (clamped) data tables:
+// LIBYUV_UNLIMITED_DATA
+// LIBYUV_UNLIMITED_CODE
+
// The following ifdef from row_win makes the C code match the row_win code,
// which is 7 bit fixed point.
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
@@ -1395,7 +1400,11 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
// KR = 0.299; KB = 0.114
// U and V contributions to R,G,B.
+#if LIBYUV_UNLIMITED_DATA
+#define UB 129 /* round(2.018 * 64) */
+#else
#define UB 128 /* max(128, round(2.018 * 64)) */
+#endif
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR 102 /* round(1.596 * 64) */
@@ -1444,9 +1453,12 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
// B = (Y - 16) * 1.164 + U * 2.112
// KR = 0.2126, KB = 0.0722
-// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
// U and V contributions to R,G,B.
+#if LIBYUV_UNLIMITED_DATA
+#define UB 135 /* round(2.112 * 64) */
+#else
#define UB 128 /* max(128, round(2.112 * 64)) */
+#endif
#define UG 14 /* round(0.213 * 64) */
#define VG 34 /* round(0.533 * 64) */
#define VR 115 /* round(1.793 * 64) */
@@ -1495,9 +1507,12 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
// B = (Y - 16) * 1.164384 + U * 2.14177
// KR = 0.2627; KB = 0.0593
-// TODO(fbarchard): Improve accuracy; the B channel is off by 7%.
// U and V contributions to R,G,B.
+#if LIBYUV_UNLIMITED_DATA
+#define UB 137 /* round(2.142 * 64) */
+#else
#define UB 128 /* max(128, round(2.142 * 64)) */
+#endif
#define UG 12 /* round(0.187326 * 64) */
#define VG 42 /* round(0.65042 * 64) */
#define VR 107 /* round(1.67867 * 64) */
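A worked example of what the clamped table costs at saturated chroma (U = 240, so u - 128 = 112), using the BT.2020 coefficient above:

    // U contribution to B in 2.6 fixed point:
    //   exact:   112 * 137 / 64 = 240   (UB = round(2.142 * 64) = 137)
    //   clamped: 112 * 128 / 64 = 224   (UB capped to fit int8_t tables)
    // The clamped table undershoots saturated blues by about 16 levels;
    // the LIBYUV_UNLIMITED_DATA tables recover this at the cost of
    // unsigned coefficient handling in the row code.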
@@ -1545,15 +1560,61 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#undef MAKEYUVCONSTANTS
+#if LIBYUV_UNLIMITED_DATA
+
+// C reference code that mimics the YUV assembly.
+// Reads 8 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel(uint8_t y,
+ uint8_t u,
+ uint8_t v,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+ int ub = yuvconstants->kUVToRB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[1];
+ int vr = yuvconstants->kUVToRB[1];
+ int bb = yuvconstants->kUVBiasBGR[0];
+ int bg = yuvconstants->kUVBiasBGR[1];
+ int br = yuvconstants->kUVBiasBGR[2];
+ int yg = yuvconstants->kYToRgb[1];
+#elif defined(__arm__)
+ int ub = yuvconstants->kUVToRB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[4];
+ int vr = yuvconstants->kUVToRB[4];
+ int bb = yuvconstants->kUVBiasBGR[0];
+ int bg = yuvconstants->kUVBiasBGR[1];
+ int br = yuvconstants->kUVBiasBGR[2];
+ int yg = yuvconstants->kYToRgb[1];
+#else
+ int ub = -yuvconstants->kUVToB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[1];
+ int vr = -yuvconstants->kUVToR[1];
+ int bb = yuvconstants->kUVBiasB[0];
+ int bg = yuvconstants->kUVBiasG[0];
+ int br = yuvconstants->kUVBiasR[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
+
+ uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+ *b = Clamp((int32_t)(y1 + (u * ub) + bb) >> 6);
+ *g = Clamp((int32_t)(y1 - (u * ug + v * vg) + bg) >> 6);
+ *r = Clamp((int32_t)(y1 + (v * vr) + br) >> 6);
+}
+#else
// C reference code that mimics the YUV assembly.
// Reads 8 bit YUV and leaves result as 8 bit.
-static __inline void YuvPixel8_8(uint8_t y,
- uint8_t u,
- uint8_t v,
- uint8_t* b,
- uint8_t* g,
- uint8_t* r,
- const struct YuvConstants* yuvconstants) {
+static __inline void YuvPixel(uint8_t y,
+ uint8_t u,
+ uint8_t v,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
#if defined(__aarch64__)
int ub = -yuvconstants->kUVToRB[0];
int ug = yuvconstants->kUVToG[0];
@@ -1584,10 +1645,11 @@ static __inline void YuvPixel8_8(uint8_t y,
#endif
uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
- *b = Clamp((int32_t)(y1 + -(u * ub) + bb) >> 6);
- *g = Clamp((int32_t)(y1 + -(u * ug + v * vg) + bg) >> 6);
- *r = Clamp((int32_t)(y1 + -(v * vr) + br) >> 6);
+ *b = Clamp((int32_t)(y1 - (u * ub) + bb) >> 6);
+ *g = Clamp((int32_t)(y1 - (u * ug + v * vg) + bg) >> 6);
+ *r = Clamp((int32_t)(y1 - (v * vr) + br) >> 6);
}
+#endif
// Reads 8 bit YUV and leaves result as 16 bit.
static __inline void YuvPixel8_16(uint8_t y,
@@ -1627,9 +1689,9 @@ static __inline void YuvPixel8_16(uint8_t y,
#endif
uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
- *b = (int)(-(u * ub) + y1 + bb);
- *g = (int)(-(u * ug + v * vg) + y1 + bg);
- *r = (int)(-(v * vr) + y1 + br);
+ *b = (int)(y1 - (u * ub) + bb);
+ *g = (int)(y1 - (u * ug + v * vg) + bg);
+ *r = (int)(y1 - (v * vr) + br);
}
// C reference code that mimics the YUV 16 bit assembly.
@@ -1678,15 +1740,61 @@ static __inline void YuvPixel10_16(uint16_t y,
*r = (int)(-(v * vr) + y1 + br);
}
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 12 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel12_16(int16_t y,
+ int16_t u,
+ int16_t v,
+ int* b,
+ int* g,
+ int* r,
+ const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+ int ub = -yuvconstants->kUVToRB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[1];
+ int vr = -yuvconstants->kUVToRB[1];
+ int bb = yuvconstants->kUVBiasBGR[0];
+ int bg = yuvconstants->kUVBiasBGR[1];
+ int br = yuvconstants->kUVBiasBGR[2];
+ int yg = yuvconstants->kYToRgb[1];
+#elif defined(__arm__)
+ int ub = -yuvconstants->kUVToRB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[4];
+ int vr = -yuvconstants->kUVToRB[4];
+ int bb = yuvconstants->kUVBiasBGR[0];
+ int bg = yuvconstants->kUVBiasBGR[1];
+ int br = yuvconstants->kUVBiasBGR[2];
+ int yg = yuvconstants->kYToRgb[1];
+#else
+ int ub = yuvconstants->kUVToB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[1];
+ int vr = yuvconstants->kUVToR[1];
+ int bb = yuvconstants->kUVBiasB[0];
+ int bg = yuvconstants->kUVBiasG[0];
+ int br = yuvconstants->kUVBiasR[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
+
+ uint32_t y1 = (uint32_t)((y << 4) * yg) >> 16;
+ u = clamp255(u >> 4);
+ v = clamp255(v >> 4);
+ *b = (int)(-(u * ub) + y1 + bb);
+ *g = (int)(-(u * ug + v * vg) + y1 + bg);
+ *r = (int)(-(v * vr) + y1 + br);
+}
+
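How YuvPixel12_16 reuses the 8 bit coefficient tables: Y is shifted up 4 to a 16 bit range so y * yg behaves like the 16 bit path, while U and V are shifted down 4 to the 8 bit range the coefficients expect. Concretely:

    // For a 12 bit sample s in [0, 4095]:
    //   luma:   (s << 4) spans [0, 65520], a 16 bit value
    //   chroma: clamp255(s >> 4) spans [0, 255], an 8 bit value
    // Example: neutral chroma s = 2048 -> 2048 >> 4 = 128, the zero
    // point the bias constants assume.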
// C reference code that mimics the YUV 10 bit assembly.
// Reads 10 bit YUV and clamps down to 8 bit RGB.
-static __inline void YuvPixel10_8(uint16_t y,
- uint16_t u,
- uint16_t v,
- uint8_t* b,
- uint8_t* g,
- uint8_t* r,
- const struct YuvConstants* yuvconstants) {
+static __inline void YuvPixel10(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
int b16;
int g16;
int r16;
@@ -1696,6 +1804,24 @@ static __inline void YuvPixel10_8(uint16_t y,
*r = Clamp(r16 >> 6);
}
+// C reference code that mimics the YUV 12 bit assembly.
+// Reads 12 bit YUV and clamps down to 8 bit RGB.
+static __inline void YuvPixel12(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+ int b16;
+ int g16;
+ int r16;
+ YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants);
+ *b = Clamp(b16 >> 6);
+ *g = Clamp(g16 >> 6);
+ *r = Clamp(r16 >> 6);
+}
+
// C reference code that mimics the YUV 16 bit assembly.
// Reads 16 bit YUV and leaves result as 8 bit.
static __inline void YuvPixel16_8(uint16_t y,
@@ -1783,9 +1909,9 @@ static __inline void YuvPixel16_16(uint16_t y,
uint32_t y1 = (uint32_t)(y * yg) >> 16;
u = clamp255(u >> 8);
v = clamp255(v >> 8);
- *b = (int)(-(u * ub) + y1 + bb);
- *g = (int)(-(u * ug + v * vg) + y1 + bg);
- *r = (int)(-(v * vr) + y1 + br);
+ *b = (int)(y1 + -(u * ub) + bb);
+ *g = (int)(y1 + -(u * ug + v * vg) + bg);
+ *r = (int)(y1 + -(v * vr) + br);
}
// C reference code that mimics the YUV assembly.
@@ -1822,11 +1948,11 @@ void I444ToARGBRow_C(const uint8_t* src_y,
for (x = 0; x < width - 1; x += 2) {
uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
- YuvPixel8_8(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
- yuvconstants);
+ YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
+ yuvconstants);
rgb_buf[3] = 255;
- YuvPixel8_8(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
- yuvconstants);
+ YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
+ yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
src_u += 2;
@@ -1834,8 +1960,8 @@ void I444ToARGBRow_C(const uint8_t* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -1848,8 +1974,8 @@ void I444ToARGBRow_C(const uint8_t* src_y,
int width) {
int x;
for (x = 0; x < width; ++x) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
src_y += 1;
src_u += 1;
@@ -1868,11 +1994,11 @@ void I422ToARGBRow_C(const uint8_t* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
- rgb_buf + 6, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
src_u += 1;
@@ -1880,8 +2006,8 @@ void I422ToARGBRow_C(const uint8_t* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -1895,11 +2021,11 @@ void I210ToARGBRow_C(const uint16_t* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YuvPixel10_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
- rgb_buf + 6, yuvconstants);
+ YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
src_u += 1;
@@ -1907,8 +2033,8 @@ void I210ToARGBRow_C(const uint16_t* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -1921,8 +2047,8 @@ void I410ToARGBRow_C(const uint16_t* src_y,
int width) {
int x;
for (x = 0; x < width; ++x) {
- YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
src_y += 1;
src_u += 1;
@@ -1940,11 +2066,11 @@ void I210AlphaToARGBRow_C(const uint16_t* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = clamp255(src_a[0] >> 2);
- YuvPixel10_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
- rgb_buf + 6, yuvconstants);
+ YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = clamp255(src_a[1] >> 2);
src_y += 2;
src_u += 1;
@@ -1953,8 +2079,8 @@ void I210AlphaToARGBRow_C(const uint16_t* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = clamp255(src_a[0] >> 2);
}
}
@@ -1968,8 +2094,8 @@ void I410AlphaToARGBRow_C(const uint16_t* src_y,
int width) {
int x;
for (x = 0; x < width; ++x) {
- YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = clamp255(src_a[0] >> 2);
src_y += 1;
src_u += 1;
@@ -1979,6 +2105,33 @@ void I410AlphaToARGBRow_C(const uint16_t* src_y,
}
}
+// 12 bit YUV to ARGB
+void I212ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ }
+}
+
static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
uint32_t ar30;
b = b >> 4; // convert 8 bit 10.6 to 10 bit.
@@ -2018,6 +2171,33 @@ void I210ToAR30Row_C(const uint16_t* src_y,
}
}
+// 12 bit YUV to 10 bit AR30
+void I212ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf + 4, b, g, r);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ }
+}
+
void I410ToAR30Row_C(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@@ -2038,6 +2218,7 @@ void I410ToAR30Row_C(const uint16_t* src_y,
}
}
+// P210 has 10 bits in msb of 16 bit NV12 style layout.
void P210ToARGBRow_C(const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* rgb_buf,
@@ -2163,11 +2344,11 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y,
for (x = 0; x < width - 1; x += 2) {
uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
- YuvPixel8_8(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
- yuvconstants);
+ YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
+ yuvconstants);
rgb_buf[3] = src_a[0];
- YuvPixel8_8(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
- yuvconstants);
+ YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
+ yuvconstants);
rgb_buf[7] = src_a[1];
src_y += 2;
src_u += 2;
@@ -2176,8 +2357,8 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = src_a[0];
}
}
@@ -2191,8 +2372,8 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y,
int width) {
int x;
for (x = 0; x < width; ++x) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = src_a[0];
src_y += 1;
src_u += 1;
@@ -2212,11 +2393,11 @@ void I422AlphaToARGBRow_C(const uint8_t* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = src_a[0];
- YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
- rgb_buf + 6, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = src_a[1];
src_y += 2;
src_u += 1;
@@ -2225,8 +2406,8 @@ void I422AlphaToARGBRow_C(const uint8_t* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = src_a[0];
}
}
@@ -2239,18 +2420,18 @@ void I422ToRGB24Row_C(const uint8_t* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
- YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4,
- rgb_buf + 5, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4,
+ rgb_buf + 5, yuvconstants);
src_y += 2;
src_u += 1;
src_v += 1;
rgb_buf += 6; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
}
}
@@ -2268,8 +2449,8 @@ void I422ToARGB4444Row_C(const uint8_t* src_y,
uint8_t r1;
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
- YuvPixel8_8(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
b0 = b0 >> 4;
g0 = g0 >> 4;
r0 = r0 >> 4;
@@ -2284,7 +2465,7 @@ void I422ToARGB4444Row_C(const uint8_t* src_y,
dst_argb4444 += 4; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
b0 = b0 >> 4;
g0 = g0 >> 4;
r0 = r0 >> 4;
@@ -2306,8 +2487,8 @@ void I422ToARGB1555Row_C(const uint8_t* src_y,
uint8_t r1;
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
- YuvPixel8_8(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
b0 = b0 >> 3;
g0 = g0 >> 3;
r0 = r0 >> 3;
@@ -2322,7 +2503,7 @@ void I422ToARGB1555Row_C(const uint8_t* src_y,
dst_argb1555 += 4; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
b0 = b0 >> 3;
g0 = g0 >> 3;
r0 = r0 >> 3;
@@ -2344,8 +2525,8 @@ void I422ToRGB565Row_C(const uint8_t* src_y,
uint8_t r1;
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
- YuvPixel8_8(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
@@ -2360,7 +2541,7 @@ void I422ToRGB565Row_C(const uint8_t* src_y,
dst_rgb565 += 4; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
@@ -2375,19 +2556,19 @@ void NV12ToARGBRow_C(const uint8_t* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YuvPixel8_8(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5,
- rgb_buf + 6, yuvconstants);
+ YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
src_uv += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -2399,19 +2580,19 @@ void NV21ToARGBRow_C(const uint8_t* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YuvPixel8_8(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5,
- rgb_buf + 6, yuvconstants);
+ YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
src_vu += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -2423,17 +2604,17 @@ void NV12ToRGB24Row_C(const uint8_t* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
- YuvPixel8_8(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4,
- rgb_buf + 5, yuvconstants);
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4,
+ rgb_buf + 5, yuvconstants);
src_y += 2;
src_uv += 2;
rgb_buf += 6; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
}
}
@@ -2444,17 +2625,17 @@ void NV21ToRGB24Row_C(const uint8_t* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
- YuvPixel8_8(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4,
- rgb_buf + 5, yuvconstants);
+ YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4,
+ rgb_buf + 5, yuvconstants);
src_y += 2;
src_vu += 2;
rgb_buf += 6; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
}
}
@@ -2471,8 +2652,8 @@ void NV12ToRGB565Row_C(const uint8_t* src_y,
uint8_t r1;
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
- YuvPixel8_8(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
@@ -2486,7 +2667,7 @@ void NV12ToRGB565Row_C(const uint8_t* src_y,
dst_rgb565 += 4; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
@@ -2500,18 +2681,18 @@ void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YuvPixel8_8(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5,
- rgb_buf + 6, yuvconstants);
+ YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_yuy2 += 4;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -2522,18 +2703,18 @@ void UYVYToARGBRow_C(const uint8_t* src_uyvy,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YuvPixel8_8(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5,
- rgb_buf + 6, yuvconstants);
+ YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_uyvy += 4;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
+ YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -2546,11 +2727,11 @@ void I422ToRGBARow_C(const uint8_t* src_y,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
- rgb_buf + 3, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+ rgb_buf + 3, yuvconstants);
rgb_buf[0] = 255;
- YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6,
- rgb_buf + 7, yuvconstants);
+ YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6,
+ rgb_buf + 7, yuvconstants);
rgb_buf[4] = 255;
src_y += 2;
src_u += 1;
@@ -2558,8 +2739,8 @@ void I422ToRGBARow_C(const uint8_t* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
- rgb_buf + 3, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+ rgb_buf + 3, yuvconstants);
rgb_buf[0] = 255;
}
}
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 2c823a13..f4d9978b 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2001,6 +2001,19 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
"packuswb %%xmm5,%%xmm5 \n" \
"lea 0x10(%[a_buf]),%[a_buf] \n"
+// Read 4 UV from 422 12 bit, upsample to 8 UV
+#define READYUV212 \
+ "movq (%[u_buf]),%%xmm0 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklwd %%xmm1,%%xmm0 \n" \
+ "psraw $0x4,%%xmm0 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "psllw $0x4,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
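A scalar sketch of what READYUV212 computes per pixel pair (the macro itself handles 8 Y / 4 UV per iteration in xmm registers):

    // Narrow 12 bit chroma to 8 bits and widen 12 bit luma to 16 bits.
    uint16_t u12 = u_buf[0], v12 = v_buf[0];
    uint8_t u8 = (uint8_t)(u12 >> 4);            // psraw $0x4 + packuswb
    uint8_t v8 = (uint8_t)(v12 >> 4);
    uint16_t y16_0 = (uint16_t)(y_buf[0] << 4);  // psllw $0x4
    uint16_t y16_1 = (uint16_t)(y_buf[1] << 4);  // each UV pair serves 2 Y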
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
"movd (%[u_buf]),%%xmm0 \n" \
@@ -2398,6 +2411,36 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
);
}
+// 12 bit YUV to ARGB
+void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV212
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
// 10 bit YUV to AR30
void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
const uint16_t* u_buf,
@@ -2433,6 +2476,41 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
);
}
+// 12 bit YUV to AR30
+void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV212
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
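What the constant setup above computes: pcmpeqb fills a register with all ones, so each 16 bit lane starts as 0xFFFF, and the shifts carve out the AR30 packing constants:

    // xmm5: 0xFFFF >> 14 = 0x0003, then << 4 = 0x0030  (the 2 alpha bits,
    //        pre-positioned for the shifts in STOREAR30)
    // xmm6: 0x0000 per lane  (lower clamp)
    // xmm7: 0xFFFF >> 6 = 0x03FF = 1023  (upper clamp, the 10 bit max)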
// 10 bit YUV to ARGB
void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
const uint16_t* u_buf,
@@ -2443,16 +2521,16 @@ void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV410
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2474,29 +2552,26 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
int width) {
asm volatile(
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ YUVTORGB_SETUP(
+ yuvconstants) "sub %[u_buf],%[v_buf] \n"
- LABELALIGN
- "1: \n"
- READYUVA210
- YUVTORGB(yuvconstants)
- STOREARGB
- "subl $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf] "+r"(y_buf), // %[y_buf]
- [u_buf] "+r"(u_buf), // %[u_buf]
- [v_buf] "+r"(v_buf), // %[v_buf]
- [a_buf] "+r"(a_buf),
- [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ LABELALIGN "1: \n" READYUVA210
+ YUVTORGB(yuvconstants) STOREARGB
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [u_buf] "+r"(u_buf), // %[u_buf]
+ [v_buf] "+r"(v_buf), // %[v_buf]
+ [a_buf] "+r"(a_buf),
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
#if defined(__i386__)
- [width]"+m"(width) // %[width]
+ [width] "+m"(width) // %[width]
#else
- [width]"+rm"(width) // %[width]
+ [width] "+rm"(width) // %[width]
#endif
- : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+ "xmm5");
}
#endif
@@ -2511,29 +2586,26 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
int width) {
asm volatile(
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ YUVTORGB_SETUP(
+ yuvconstants) "sub %[u_buf],%[v_buf] \n"
- LABELALIGN
- "1: \n"
- READYUVA410
- YUVTORGB(yuvconstants)
- STOREARGB
- "subl $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf] "+r"(y_buf), // %[y_buf]
- [u_buf] "+r"(u_buf), // %[u_buf]
- [v_buf] "+r"(v_buf), // %[v_buf]
- [a_buf] "+r"(a_buf),
- [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ LABELALIGN "1: \n" READYUVA410
+ YUVTORGB(yuvconstants) STOREARGB
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [u_buf] "+r"(u_buf), // %[u_buf]
+ [v_buf] "+r"(v_buf), // %[v_buf]
+ [a_buf] "+r"(a_buf),
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
#if defined(__i386__)
- [width]"+m"(width) // %[width]
+ [width] "+m"(width) // %[width]
#else
- [width]"+rm"(width) // %[width]
+ [width] "+rm"(width) // %[width]
#endif
- : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+ "xmm5");
}
#endif
@@ -2547,21 +2619,21 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV410
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2729,26 +2801,22 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
+ asm volatile(
- YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ YUVTORGB_SETUP(
+ yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- READP210
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf] "+r"(y_buf), // %[y_buf]
- [uv_buf] "+r"(uv_buf), // %[u_buf]
- [dst_argb] "+r"(dst_argb), // %[dst_argb]
- [width] "+rm"(width) // %[width]
- : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ LABELALIGN "1: \n" READP210
+ YUVTORGB(yuvconstants) STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [uv_buf] "+r"(uv_buf), // %[u_buf]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+rm"(width) // %[width]
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+ "xmm5");
}
void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
@@ -2756,25 +2824,22 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
+ asm volatile(
- YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ YUVTORGB_SETUP(
+ yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- READP410
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf] "+r"(y_buf), // %[y_buf]
- [uv_buf] "+r"(uv_buf), // %[u_buf]
- [dst_argb] "+r"(dst_argb), // %[dst_argb]
- [width] "+rm"(width) // %[width]
- : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
- "xmm5");
+ LABELALIGN "1: \n" READP410
+ YUVTORGB(yuvconstants) STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [uv_buf] "+r"(uv_buf), // %[u_buf]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+rm"(width) // %[width]
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+ "xmm5");
}
void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
@@ -2785,20 +2850,20 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READP210
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
@@ -2817,20 +2882,20 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READP410
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
@@ -2948,6 +3013,21 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpsllw $6,%%ymm4,%%ymm4 \n" \
"lea 0x20(%[y_buf]),%[y_buf] \n"
+// Read 8 UV from 212 (12 bit), upsample to 16 UV.
+#define READYUV212_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm0 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $0x4,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
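In scalar terms, the new READYUV212_AVX2 does the following per sample (assuming, per the formats.md section of this change, 12 bit samples in the low bits of each uint16_t; illustration only):

    // UV: arithmetic shift right by 4 narrows 12 bit chroma to 8 bits
    //     (vpsraw $0x4); the pairs are then duplicated to upsample
    //     4:2:2 to 4:4:4 (the final vpunpcklwd).
    static inline uint8_t Chroma12To8(uint16_t c12) {
      return (uint8_t)(c12 >> 4);
    }
    // Y: shift left by 4 places 12 bit luma in the top of 16 bits
    //    (vpsllw $0x4) for the 16 bit YUV-to-RGB path.
    static inline uint16_t Luma12To16(uint16_t y12) {
      return (uint16_t)(y12 << 4);
    }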
// Read 16 UV from 410. With 16 Alpha.
#define READYUVA410_AVX2 \
"vmovdqu (%[u_buf]),%%ymm0 \n" \
@@ -3295,6 +3375,41 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
}
#endif // HAS_I210TOARGBROW_AVX2
+#if defined(HAS_I212TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV212_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I212TOARGBROW_AVX2
+
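The convert_argb.cc dispatch glue is outside this hunk; a hedged sketch of how such a row function is typically selected, assuming the usual _C and _Any_ fallback naming (TestCpuFlag, kCpuHasAVX2 and IS_ALIGNED are existing libyuv APIs):

    // Illustration only: inside the conversion function, before the row loop.
    void (*I212ToARGBRow)(const uint16_t*, const uint16_t*, const uint16_t*,
                          uint8_t*, const struct YuvConstants*, int) =
        I212ToARGBRow_C;
    #if defined(HAS_I212TOARGBROW_AVX2)
      if (TestCpuFlag(kCpuHasAVX2)) {
        I212ToARGBRow = I212ToARGBRow_Any_AVX2;   // handles any width
        if (IS_ALIGNED(width, 16)) {
          I212ToARGBRow = I212ToARGBRow_AVX2;     // 16 pixels per iteration
        }
      }
    #endif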
#if defined(HAS_I210TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
@@ -3335,6 +3450,46 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
}
#endif // HAS_I210TOAR30ROW_AVX2
+#if defined(HAS_I212TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV212_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I212TOAR30ROW_AVX2
+
#if defined(HAS_I410TOARGBROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
@@ -3347,17 +3502,17 @@ void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV410_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
@@ -3383,32 +3538,28 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
int width) {
asm volatile(
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ YUVTORGB_SETUP_AVX2(
+ yuvconstants) "sub %[u_buf],%[v_buf] \n"
- LABELALIGN
- "1: \n"
- READYUVA210_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "subl $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ LABELALIGN "1: \n" READYUVA210_AVX2
+ YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
- : [y_buf] "+r"(y_buf), // %[y_buf]
- [u_buf] "+r"(u_buf), // %[u_buf]
- [v_buf] "+r"(v_buf), // %[v_buf]
- [a_buf] "+r"(a_buf), // %[a_buf]
- [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [u_buf] "+r"(u_buf), // %[u_buf]
+ [v_buf] "+r"(v_buf), // %[v_buf]
+ [a_buf] "+r"(a_buf), // %[a_buf]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
#if defined(__i386__)
- [width]"+m"(width) // %[width]
+ [width] "+m"(width) // %[width]
#else
- [width]"+rm"(width) // %[width]
+ [width] "+rm"(width) // %[width]
#endif
- : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5"
- );
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
+ "xmm4", "xmm5");
}
#endif // HAS_I210ALPHATOARGBROW_AVX2
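One detail visible in these Alpha hunks: the width operand is "+m" on __i386__ but "+rm" elsewhere. With five buffer pointers plus yuvconstants competing for 32 bit x86's few usable general registers, width must stay in memory, and the loop spells out subl because a memory operand does not imply an operand size. Illustration only:

    #if defined(__i386__)
      [width] "+m"(width)   // too few GPRs on 32 bit x86; keep width in memory
    #else
      [width] "+rm"(width)  // let the compiler choose register or memory
    #endif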
@@ -3424,32 +3575,28 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
int width) {
asm volatile(
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ YUVTORGB_SETUP_AVX2(
+ yuvconstants) "sub %[u_buf],%[v_buf] \n"
- LABELALIGN
- "1: \n"
- READYUVA410_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "subl $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ LABELALIGN "1: \n" READYUVA410_AVX2
+ YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
- : [y_buf] "+r"(y_buf), // %[y_buf]
- [u_buf] "+r"(u_buf), // %[u_buf]
- [v_buf] "+r"(v_buf), // %[v_buf]
- [a_buf] "+r"(a_buf), // %[a_buf]
- [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [u_buf] "+r"(u_buf), // %[u_buf]
+ [v_buf] "+r"(v_buf), // %[v_buf]
+ [a_buf] "+r"(a_buf), // %[a_buf]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
#if defined(__i386__)
- [width]"+m"(width) // %[width]
+ [width] "+m"(width) // %[width]
#else
- [width]"+rm"(width) // %[width]
+ [width] "+rm"(width) // %[width]
#endif
- : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5"
- );
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
+ "xmm4", "xmm5");
}
#endif // HAS_I410ALPHATOARGBROW_AVX2
@@ -3465,23 +3612,23 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV410_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -3764,16 +3911,16 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READP210_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -3797,16 +3944,16 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READP410_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -3830,22 +3977,22 @@ void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READP210_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
@@ -3868,22 +4015,22 @@ void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READP410_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
@@ -4409,33 +4556,33 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv,
depth = 16 - depth;
// clang-format off
asm volatile (
- "vmovd %4,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%xmm3 \n"
- "vbroadcastf128 %5,%%ymm4 \n"
- "sub %1,%2 \n"
+ "vmovd %4,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%xmm3 \n"
+ "vbroadcastf128 %5,%%ymm4 \n"
+ "sub %1,%2 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "add $0x40,%0 \n"
-
- "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n"
- "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vextractf128 $0x0,%%ymm0,(%1) \n"
- "vextractf128 $0x0,%%ymm1,0x10(%1) \n"
- "vextractf128 $0x1,%%ymm0,(%1,%2) \n"
- "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n"
- "add $0x20,%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+
+ "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n"
+ "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x0,%%ymm1,0x10(%1) \n"
+ "vextractf128 $0x1,%%ymm0,(%1,%2) \n"
+ "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4499,24 +4646,24 @@ void DivideRow_16_AVX2(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%0,%1) \n"
- "vmovdqu %%ymm1,0x20(%0,%1) \n"
- "add $0x40,%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width), // %2
@@ -5173,7 +5320,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
#if defined(__i386__)
"+m"(width) // %5
#else
- "+rm"(width) // %5
+ "+rm"(width) // %5
#endif
: "m"(kShuffleMaskARGBSplit) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
@@ -5264,7 +5411,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
#if defined(__i386__)
"+m"(width) // %5
#else
- "+rm"(width) // %5
+ "+rm"(width) // %5
#endif
: "m"(kShuffleMaskARGBSplit), // %6
"m"(kShuffleMaskARGBPermute) // %7
@@ -7981,7 +8128,7 @@ void HalfFloatRow_AVX2(const uint16_t* src,
#if defined(__x86_64__)
: "x"(scale) // %3
#else
- : "m"(scale) // %3
+ : "m"(scale) // %3
#endif
: "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
@@ -8019,7 +8166,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
#if defined(__x86_64__)
: "x"(scale) // %3
#else
- : "m"(scale) // %3
+ : "m"(scale) // %3
#endif
: "memory", "cc", "xmm2", "xmm3", "xmm4");
}
diff --git a/source/scale.cc b/source/scale.cc
index 3ccd2111..a254737c 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1619,7 +1619,7 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
uint16_t* dst_ptr) {
void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
- ScaleRowUp2_Bilinear_16_Any_C;
+ ScaleRowUp2_Bilinear_16_Any_C;
int x;
// This function can only scale up by 2 times.
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index de5cd00e..84584582 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -3167,67 +3167,11 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
}
#endif // HAS_ABGRTOAR30ROW_AVX2
-// TODO(fbarchard): Fix clamping issue affected by U channel.
-#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- const int kBpc = 2; \
- align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
- align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
- align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = \
- (fastrand() & ((1 << S_DEPTH) - 1)); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = \
- (fastrand() & ((1 << S_DEPTH) - 1)); \
- reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = \
- (fastrand() & ((1 << S_DEPTH) - 1)); \
- } \
- memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B( \
- reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
- reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
- reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
- dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B( \
- reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
- reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
- reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
- dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
- } \
- for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
- EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \
- } \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, S_DEPTH) \
- TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, _Any, +, 0, 0, S_DEPTH) \
- TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, _Unaligned, +, 1, 1, S_DEPTH) \
- TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \
- TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH)
+// Provide matrix wrappers for 12 bit YUV
+#define I012ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I012ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define I012ToAR30(a, b, c, d, e, f, g, h, i, j) \
+ I012ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
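A hedged usage sketch of the new 12 bit entry point outside the test harness (the wrapper function and buffer names here are hypothetical; argument order follows the wrapper above, and strides for the 16 bit planes are in uint16_t elements):

    // Illustration only: convert one I012 (4:2:0, 12 bit) frame to ARGB
    // with the BT.601 limited range matrix.
    static int ConvertI012FrameToARGB(const uint16_t* src_y,
                                      const uint16_t* src_u,
                                      const uint16_t* src_v,
                                      uint8_t* dst_argb, int width,
                                      int height) {
      return I012ToARGBMatrix(src_y, width, src_u, (width + 1) / 2, src_v,
                              (width + 1) / 2, dst_argb, width * 4,
                              &kYuvI601Constants, width, height);
    }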
#define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \
I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
@@ -3254,43 +3198,105 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
#define U410ToAB30(a, b, c, d, e, f, g, h, i, j) \
I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j)
-TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1, 10)
-TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1, 10)
-TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1, 10)
-TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1, 10)
-TESTPLANAR16TOB(U010, 2, 2, ARGB, 4, 4, 1, 10)
-TESTPLANAR16TOB(U010, 2, 2, ABGR, 4, 4, 1, 10)
-TESTPLANAR16TOB(I210, 2, 1, ARGB, 4, 4, 1, 10)
-TESTPLANAR16TOB(I210, 2, 1, ABGR, 4, 4, 1, 10)
-TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1, 10)
-TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1, 10)
-TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1, 10)
-TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1, 10)
-TESTPLANAR16TOB(I410, 1, 1, ARGB, 4, 4, 1, 10)
-TESTPLANAR16TOB(I410, 1, 1, ABGR, 4, 4, 1, 10)
-TESTPLANAR16TOB(H410, 1, 1, ARGB, 4, 4, 1, 10)
-TESTPLANAR16TOB(H410, 1, 1, ABGR, 4, 4, 1, 10)
-TESTPLANAR16TOB(U410, 1, 1, ARGB, 4, 4, 1, 10)
-TESTPLANAR16TOB(U410, 1, 1, ABGR, 4, 4, 1, 10)
+// TODO(fbarchard): Fix clamping issue caused by the U channel.
+#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \
+ BPP_B, ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ const int kBpc = 2; \
+ align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
+ align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
+ align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = (fastrand() & FMT_MASK); \
+ reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = (fastrand() & FMT_MASK); \
+ } \
+ memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B( \
+ reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
+ dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B( \
+ reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
+ dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \
+ BPP_B, ALIGN, YALIGN) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_, _Unaligned, +, 1, 1) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_, _Invert, -, 0, 0) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_, _Opt, +, 0, 0)
+
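The new FMT_MASK argument replaces the old S_DEPTH parameter; the relation is simply mask = (1 << depth) - 1, so the 10 bit formats pass 0x3ff and the new 12 bit formats pass 0xfff. An illustrative macro, not part of the diff:

    // Keeps fastrand() samples within the format's bit depth.
    #define DEPTH_MASK(depth) ((1 << (depth)) - 1)  // 10 -> 0x3ff, 12 -> 0xfff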
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(I012, 2, 2, 0xfff, ARGB, 4, 4, 1)
+
#ifdef LITTLE_ENDIAN_ONLY_TEST
-TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 10)
-TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 10)
-TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 10)
-TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1, 10)
-TESTPLANAR16TOB(U010, 2, 2, AR30, 4, 4, 1, 10)
-TESTPLANAR16TOB(U010, 2, 2, AB30, 4, 4, 1, 10)
-TESTPLANAR16TOB(I210, 2, 1, AR30, 4, 4, 1, 10)
-TESTPLANAR16TOB(I210, 2, 1, AB30, 4, 4, 1, 10)
-TESTPLANAR16TOB(H210, 2, 1, AR30, 4, 4, 1, 10)
-TESTPLANAR16TOB(H210, 2, 1, AB30, 4, 4, 1, 10)
-TESTPLANAR16TOB(U210, 2, 1, AR30, 4, 4, 1, 10)
-TESTPLANAR16TOB(U210, 2, 1, AB30, 4, 4, 1, 10)
-TESTPLANAR16TOB(I410, 1, 1, AR30, 4, 4, 1, 10)
-TESTPLANAR16TOB(I410, 1, 1, AB30, 4, 4, 1, 10)
-TESTPLANAR16TOB(H410, 1, 1, AR30, 4, 4, 1, 10)
-TESTPLANAR16TOB(H410, 1, 1, AB30, 4, 4, 1, 10)
-TESTPLANAR16TOB(U410, 1, 1, AR30, 4, 4, 1, 10)
-TESTPLANAR16TOB(U410, 1, 1, AB30, 4, 4, 1, 10)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I012, 2, 2, 0xfff, AR30, 4, 4, 1)
#endif
#define TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index ac976612..cfc12f3a 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -302,7 +302,7 @@ TEST_FACTOR(3, 1, 3)
TEST_SCALETO1(name, width, height, Bilinear, 3)
TEST_SCALETO(ARGBScale, 1, 1)
-TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */
+TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */
TEST_SCALETO(ARGBScale, 320, 240)
TEST_SCALETO(ARGBScale, 569, 480)
TEST_SCALETO(ARGBScale, 640, 360)
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 066bcfde..6da6b574 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -1025,7 +1025,7 @@ TEST_FACTOR(3, 1, 3, 0)
#endif
TEST_SCALETO(Scale, 1, 1)
-TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */
+TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */
TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 569, 480)
TEST_SCALETO(Scale, 640, 360)
diff --git a/unit_test/scale_uv_test.cc b/unit_test/scale_uv_test.cc
index 59eeee30..6e4649f8 100644
--- a/unit_test/scale_uv_test.cc
+++ b/unit_test/scale_uv_test.cc
@@ -166,7 +166,7 @@ TEST_FACTOR(3, 1, 3)
TEST_SCALETO1(name, width, height, Bilinear, 3)
TEST_SCALETO(UVScale, 1, 1)
-TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */
+TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */
TEST_SCALETO(UVScale, 320, 240)
TEST_SCALETO(UVScale, 569, 480)
TEST_SCALETO(UVScale, 640, 360)
diff --git a/unit_test/unit_test.cc b/unit_test/unit_test.cc
index 85e3b717..e6dbc3ee 100644
--- a/unit_test/unit_test.cc
+++ b/unit_test/unit_test.cc
@@ -26,9 +26,13 @@ unsigned int fastrand_seed = 0xfb;
ABSL_FLAG(int32_t, libyuv_width, 0, "width of test image.");
ABSL_FLAG(int32_t, libyuv_height, 0, "height of test image.");
ABSL_FLAG(int32_t, libyuv_repeat, 0, "number of times to repeat test.");
-ABSL_FLAG(int32_t, libyuv_flags, 0,
+ABSL_FLAG(int32_t,
+ libyuv_flags,
+ 0,
"cpu flags for reference code. 1 = C, -1 = SIMD");
-ABSL_FLAG(int32_t, libyuv_cpu_info, 0,
+ABSL_FLAG(int32_t,
+ libyuv_cpu_info,
+ 0,
"cpu flags for benchmark code. 1 = C, -1 = SIMD");
#else
// Disable command line parameters if absl/flags disabled.