author     Frank Barchard <fbarchard@google.com>      2021-02-05 16:14:25 -0800
committer  Frank Barchard <fbarchard@chromium.org>    2021-02-06 00:26:55 +0000
commit     942c5084482d8592883be66151e0dea502f4cbc0 (patch)
tree       e02d37c46fb6e9d06d5165fc86ce483b70928524
parent     60d37a064bc0307017537ed3091b1b0204213855 (diff)
download   libyuv-942c5084482d8592883be66151e0dea502f4cbc0.tar.gz
BT.2020 Full Range yuvconstants
New color util to compute the constants needed, based on the white point.

[ RUN      ] LibYUVColorTest.TestFullYUVV
hist      -2      -1        0       1      2
red        0 1627136 13670144 1479936      0
green 319285 3456836  9243059 3440771 317265
blue       0 1561088 14202112 1014016      0

Bug: libyuv:877, b/178283356
Change-Id: If432ebfab76b01302fdb416a153c4f26ca0832d6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2678859
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
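For context on what the new util computes: full range coefficients follow directly from a colorspace's KR/KB (the same WR/WB relations that the old F709 comment, removed in row_common.cc below, spelled out). A minimal sketch of that derivation in C, assuming only those relations; this is an illustration, not the util/color.cc added by this change.

#include <math.h>
#include <stdio.h>

/* Illustrative only. For BT.2020 (KR = 0.2627, KB = 0.0593) this reproduces
 * the full range coefficients added to row_common.cc below:
 * UB 120, UG 11, VG 37, VR 94. */
static void PrintFullRangeCoefficients(double kr, double kb) {
  double kg = 1.0 - kr - kb;
  double ub = 2.0 * (1.0 - kb);           /* B = Y + ub * (U - 128) */
  double vr = 2.0 * (1.0 - kr);           /* R = Y + vr * (V - 128) */
  double ug = 2.0 * kb * (1.0 - kb) / kg; /* G = Y - ug*(U-128) - vg*(V-128) */
  double vg = 2.0 * kr * (1.0 - kr) / kg;
  /* libyuv stores each factor in 6 bit fixed point, i.e. round(x * 64). */
  printf("UB %.0f UG %.0f VG %.0f VR %.0f\n", round(ub * 64), round(ug * 64),
         round(vg * 64), round(vr * 64));
}

int main(void) {
  PrintFullRangeCoefficients(0.2627, 0.0593); /* UB 120 UG 11 VG 37 VR 94 */
  return 0;
}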
-rw-r--r--  README.chromium                   2
-rw-r--r--  include/libyuv/convert.h          2
-rw-r--r--  include/libyuv/convert_argb.h    24
-rw-r--r--  include/libyuv/version.h          2
-rw-r--r--  source/convert_from.cc            2
-rw-r--r--  source/row_common.cc            312
-rw-r--r--  source/scale.cc                  18
-rw-r--r--  source/scale_any.cc               2
-rw-r--r--  source/scale_common.cc            6
-rw-r--r--  source/scale_gcc.cc             661
-rw-r--r--  source/scale_neon.cc              6
-rw-r--r--  source/scale_neon64.cc           18
-rw-r--r--  unit_test/color_test.cc         131
-rw-r--r--  unit_test/convert_test.cc        56
-rw-r--r--  util/color.cc                   118
15 files changed, 761 insertions, 599 deletions
diff --git a/README.chromium b/README.chromium
index d27d1aa3..bdd05f1f 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1774
+Version: 1775
License: BSD
License File: LICENSE
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
index 50ffc2f0..137b30f1 100644
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -213,7 +213,7 @@ int I010ToI410(const uint16_t* src_y,
// Convert I012 to I412
#define I012ToI412 I010ToI410
-// Convert I212 to I412
+// Convert I210 to I410
LIBYUV_API
int I210ToI410(const uint16_t* src_y,
int src_stride_y,
diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h
index d9cc5bd2..cf7f923e 100644
--- a/include/libyuv/convert_argb.h
+++ b/include/libyuv/convert_argb.h
@@ -21,18 +21,20 @@ extern "C" {
#endif
// Conversion matrix for YUV to RGB
-LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
-LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg
-LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full
-LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
-LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg
+LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full
+LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYuvV2020Constants; // BT.2020 full
// Conversion matrix for YVU to BGR
-LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
-LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg
-LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full
-LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
-LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg
+LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full
+LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full
// Macros for end swapped destination Matrix conversions.
// Swap UV and pass mirrored kYvuJPEGConstants matrix.
@@ -42,6 +44,8 @@ LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
#define kYuvF709ConstantsVU kYvuF709Constants
#define kYuvH709ConstantsVU kYvuH709Constants
#define kYuv2020ConstantsVU kYvu2020Constants
+#define kYuvV2020ConstantsVU kYvuV2020Constants
+
#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index ff3c9dec..a57dfa53 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1774
+#define LIBYUV_VERSION 1775
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 6524f969..591e2782 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -30,6 +30,8 @@ static __inline int Abs(int v) {
}
// I420 To any I4xx YUV format with mirroring.
+// TODO(fbarchard): Consider kFilterNone for Y, or CopyPlane
+
static int I420ToI4xx(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
diff --git a/source/row_common.cc b/source/row_common.cc
index c3942cf7..eb889c83 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1330,234 +1330,218 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
// Macros to create SIMD specific yuv to rgb conversion constants.
#if defined(__aarch64__)
-#define MAKEYUVCONSTANTS(name, YG, YGB, UB, UG, VG, VR, BB, BG, BR) \
+#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
+ const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
+ {UB, VR, UB, VR, UB, VR, UB, VR}, {UB, VR, UB, VR, UB, VR, UB, VR}, \
+ {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, \
+ {BB, BG, BR, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}}; \
+ const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
+ {VR, UB, VR, UB, VR, UB, VR, UB}, {VR, UB, VR, UB, VR, UB, VR, UB}, \
+ {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, \
+ {BR, BG, BB, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}};
+
+#elif defined(__arm__)
+#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, \
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, \
- {UG, VG, UG, VG, UG, VG, UG, VG}, \
- {UG, VG, UG, VG, UG, VG, UG, VG}, \
- {BB, BG, BR, YGB, 0, 0, 0, 0}, \
+ {UB, UB, UB, UB, VR, VR, VR, VR, 0, 0, 0, 0, 0, 0, 0, 0}, \
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, \
+ {BB, BG, BR, YB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, \
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, \
- {VG, UG, VG, UG, VG, UG, VG, UG}, \
- {VG, UG, VG, UG, VG, UG, VG, UG}, \
- {BR, BG, BB, YGB, 0, 0, 0, 0}, \
- {0x0101 * YG, YG, 0, 0}};
-
-#elif defined(__arm__)
-#define MAKEYUVCONSTANTS(name, YG, YGB, UB, UG, VG, VR, BB, BG, BR) \
- const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
- {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, \
- {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, \
- {BB, BG, BR, YGB, 0, 0, 0, 0}, \
- {0x0101 * YG, YG, 0, 0}}; \
- const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
- {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, \
- {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, \
- {BR, BG, BB, YGB, 0, 0, 0, 0}, \
+ {VR, VR, VR, VR, UB, UB, UB, UB, 0, 0, 0, 0, 0, 0, 0, 0}, \
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, \
+ {BR, BG, BB, YB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}};
#else
-#define MAKEYUVCONSTANTS(name, YG, YGB, UB, UG, VG, VR, BB, BG, BR) \
- const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
- {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \
- {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
- {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
- {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, \
- YGB, YGB}}; \
- const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
- {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, \
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, \
- {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, \
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, \
- {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, \
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, \
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
- {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, \
- YGB, YGB}};
+#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
+ const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
+ {-UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, \
+ -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0}, \
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
+ {0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, \
+ 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR}, \
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
+ {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}; \
+ const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
+ {-VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, \
+ -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0}, \
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, \
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, \
+ {0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, \
+ 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB}, \
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
+ {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}};
#endif
// TODO(fbarchard): Generate SIMD structures from float matrix.
-// BT.601 YUV to RGB reference
-// R = (Y - 16) * 1.164 - V * -1.596
-// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
-// B = (Y - 16) * 1.164 - U * -2.018
+// Bias values to round, and subtract 128 from U and V.
+#define BB (-UB * 128 + YB)
+#define BG (UG * 128 + VG * 128 + YB)
+#define BR (-VR * 128 + YB)
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+// BT.601 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164 + V * 1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 + U * 2.018
+// KR = 0.299; KB = 0.114
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
-#define VR -102 /* round(-1.596 * 64) */
+#define UB 128 /* max(128, round(2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR 102 /* round(1.596 * 64) */
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-MAKEYUVCONSTANTS(I601, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
-// JPEG YUV to RGB reference
-// * R = Y - V * -1.40200
-// * G = Y - U * 0.34414 - V * 0.71414
-// * B = Y - U * -1.77200
-
-// Y contribution to R,G,B. Scale and bias.
-#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32 /* 64 / 2 */
+// BT.601 full range YUV to RGB reference (aka JPEG)
+// * R = Y + V * 1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y + U * 1.77200
+// KR = 0.299; KB = 0.114
// U and V contributions to R,G,B.
-#define UB -113 /* round(-1.77200 * 64) */
-#define UG 22 /* round(0.34414 * 64) */
-#define VG 46 /* round(0.71414 * 64) */
-#define VR -90 /* round(-1.40200 * 64) */
+#define UB 113 /* round(1.77200 * 64) */
+#define UG 22 /* round(0.34414 * 64) */
+#define VG 46 /* round(0.71414 * 64) */
+#define VR 90 /* round(1.40200 * 64) */
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+// Y contribution to R,G,B. Scale and bias.
+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YB 32 /* 64 / 2 */
-MAKEYUVCONSTANTS(JPEG, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
-// BT.709 YUV to RGB reference
-// R = (Y - 16) * 1.164 - V * -1.793
-// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
-// B = (Y - 16) * 1.164 - U * -2.112
-
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+// BT.709 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164 + V * 1.793
+// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
+// B = (Y - 16) * 1.164 + U * 2.112
+// KR = 0.2126, KB = 0.0722
// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.112 * 64)) */
-#define UG 14 /* round(0.213 * 64) */
-#define VG 34 /* round(0.533 * 64) */
-#define VR -115 /* round(-1.793 * 64) */
+#define UB 128 /* max(128, round(2.112 * 64)) */
+#define UG 14 /* round(0.213 * 64) */
+#define VG 34 /* round(0.533 * 64) */
+#define VR 115 /* round(1.793 * 64) */
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-MAKEYUVCONSTANTS(H709, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
// BT.709 full range YUV to RGB reference
-// R = Y - V * -1.5748
-// G = Y - U * 0.18732 - V * 0.46812
-// B = Y - U * -1.8556
-// WR = 0.2126
-// WB = 0.0722
-// WR and WB given, the equations are:
-// R = Y + (2 * (1 - WR)) * V;
-// G = Y - ((2 * ((WR * (1 - WR) * V) + (WB * (1 - WB) * U))) / (1 - WB - WR));
-// B = Y + (2 * (1 - WB)) * U;
-
-// Y contribution to R,G,B. Scale and bias. (same as jpeg)
-#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
-#define YGB 32 /* 64 / 2 */
+// R = Y + V * 1.5748
+// G = Y - U * 0.18732 - V * 0.46812
+// B = Y + U * 1.8556
+// KR = 0.2126, KB = 0.0722
// U and V contributions to R,G,B.
-#define UB -119 /* round(-1.8556 * 64) */
-#define UG 12 /* round(0.18732 * 64) */
-#define VG 30 /* round(0.46812 * 64) */
-#define VR -101 /* round(-1.5748 * 64) */
+#define UB 119 /* round(1.8556 * 64) */
+#define UG 12 /* round(0.18732 * 64) */
+#define VG 30 /* round(0.46812 * 64) */
+#define VR 101 /* round(1.5748 * 64) */
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+// Y contribution to R,G,B. Scale and bias. (same as jpeg)
+#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
+#define YB 32 /* 64 / 2 */
-MAKEYUVCONSTANTS(F709, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
-
-// BT.2020 YUV to RGB reference
-// R = (Y - 16) * 1.164384 - V * -1.67867
-// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
-// B = (Y - 16) * 1.164384 - U * -2.14177
-// Y contribution to R,G,B. Scale and bias.
-#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
+// BT.2020 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164384 + V * 1.67867
+// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
+// B = (Y - 16) * 1.164384 + U * 2.14177
+// KR = 0.2627; KB = 0.0593
// TODO(fbarchard): Improve accuracy; the B channel is off by 7%.
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.142 * 64)) */
-#define UG 12 /* round(0.187326 * 64) */
-#define VG 42 /* round(0.65042 * 64) */
-#define VR -107 /* round(-1.67867 * 64) */
+#define UB 128 /* max(128, round(2.142 * 64)) */
+#define UG 12 /* round(0.187326 * 64) */
+#define VG 42 /* round(0.65042 * 64) */
+#define VR 107 /* round(1.67867 * 64) */
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+// Y contribution to R,G,B. Scale and bias.
+#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
-MAKEYUVCONSTANTS(2020, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
+
+// BT.2020 full range YUV to RGB reference
+// R = Y + V * 1.474600
+// G = Y - U * 0.164553 - V * 0.571353
+// B = Y + U * 1.881400
+// KR = 0.2627; KB = 0.0593
+
+#define UB 120 /* round(1.881400 * 64) */
+#define UG 11 /* round(0.164553 * 64) */
+#define VG 37 /* round(0.571353 * 64) */
+#define VR 94 /* round(1.474600 * 64) */
+
+// Y contribution to R,G,B. Scale and bias. (same as jpeg)
+#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
+#define YB 32 /* 64 / 2 */
+
+MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+
#undef YG
+#undef YB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+
+#undef BB
+#undef BG
+#undef BR
#undef MAKEYUVCONSTANTS
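To tie the new V2020 table back to its reference comment: each factor is stored as round(value * 64), and the BB/BG/BR biases fold in the 128 offset on U and V plus the rounding term YB. A floating point reference of the kind the color test measures against might look like the sketch below (hypothetical helper; the test's own V2020 reference is outside this excerpt). The histogram in the commit message, with every channel within plus or minus 2, is the size of error the fixed point path shows against a reference of this form.

#include <math.h>

static int Clamp255(double v) {
  return v < 0.0 ? 0 : (v > 255.0 ? 255 : (int)lrint(v));
}

/* BT.2020 full range YUV to RGB, taken directly from the reference comment
 * above. Hypothetical test helper, not part of this change. */
static void V2020ToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
  *r = Clamp255(y + (v - 128) * 1.474600);
  *g = Clamp255(y - (u - 128) * 0.164553 - (v - 128) * 0.571353);
  *b = Clamp255(y + (u - 128) * 1.881400);
}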
diff --git a/source/scale.cc b/source/scale.cc
index 34c05699..16771cd8 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1336,7 +1336,7 @@ void ScalePlaneBilinearUp(int src_width,
}
}
-// Scale plane, horizontally 2 times, vertically any time.
+// Scale plane, horizontally up by 2 times.
// Uses linear filter horizontally, nearest vertically.
// This is an optimized version for scaling up a plane to 2 times of
// its original width, using linear interpolation.
@@ -1356,7 +1356,7 @@ void ScalePlaneUp2_Linear(int src_width,
int dy;
// This function can only scale up by 2 times horizontally.
- assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
+ assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -1396,7 +1396,7 @@ void ScalePlaneUp2_Linear(int src_width,
}
}
-// Scale plane, 2 times.
+// Scale plane, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original size, using bilinear interpolation.
// This is used to scale U and V planes of I420 to I444.
@@ -1414,7 +1414,7 @@ void ScalePlaneUp2_Bilinear(int src_width,
int x;
// This function can only scale up by 2 times.
- assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
+ assert(src_width == ((dst_width + 1) / 2));
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
@@ -1449,7 +1449,7 @@ void ScalePlaneUp2_Bilinear(int src_width,
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
- // TODO test performance of writing one row of destination at a time
+ // TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
@@ -1458,7 +1458,7 @@ void ScalePlaneUp2_Bilinear(int src_width,
}
}
-// Scale at most 14bit plane, horizontally 2 times.
+// Scale at most 14 bit plane, horizontally up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original width, using linear interpolation.
// stride is in count of uint16_t.
@@ -1478,7 +1478,7 @@ void ScalePlaneUp2_16_Linear(int src_width,
int dy;
// This function can only scale up by 2 times horizontally.
- assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
+ assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -1512,7 +1512,7 @@ void ScalePlaneUp2_16_Linear(int src_width,
}
}
-// Scale at most 12bit plane, up 2 times.
+// Scale at most 12 bit plane, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original size, using bilinear interpolation.
// stride is in count of uint16_t.
@@ -1531,7 +1531,7 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
int x;
// This function can only scale up by 2 times.
- assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
+ assert(src_width == ((dst_width + 1) / 2));
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
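The rewritten asserts accept exactly the same shapes as before, dst_width equal to either 2 * src_width or 2 * src_width - 1, just expressed from the destination side. A quick equivalence check for positive widths:

#include <assert.h>

/* For src_width > 0, the accepted set { 2*src_width, 2*src_width - 1 } of
 * dst_width values is exactly what src_width == (dst_width + 1) / 2 allows. */
static void CheckUp2AssertEquivalence(int src_width, int dst_width) {
  int old_ok = (src_width * 2 == dst_width) || (src_width * 2 == dst_width + 1);
  int new_ok = (src_width == (dst_width + 1) / 2);
  assert(old_ok == new_ok);
}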
diff --git a/source/scale_any.cc b/source/scale_any.cc
index 5fd27ae6..79394985 100644
--- a/source/scale_any.cc
+++ b/source/scale_any.cc
@@ -625,7 +625,7 @@ CANY(ScaleARGBFilterCols_Any_MSA,
dst_ptr[dst_width - 1] = src_ptr[(dst_width / 2) - 1]; \
}
-// Even the C version need to be wrapped, because boundary pixels have to
+// Even the C versions need to be wrapped, because boundary pixels have to
// be handled differently
SUH2LANY(ScaleRowUp2_Linear_Any_C,
diff --git a/source/scale_common.cc b/source/scale_common.cc
index f53e2de9..8d41c03d 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -400,7 +400,7 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
}
}
-// sample position: (O is src sample position, X is dst sample position)
+// Sample position: (O is src sample position, X is dst sample position)
//
// v dst_ptr at here v stop at here
// X O X X O X X O X X O X X O X
@@ -417,7 +417,7 @@ void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
}
}
-// sample position: (O is src sample position, X is dst sample position)
+// Sample position: (O is src sample position, X is dst sample position)
//
// src_ptr at here
// X v X X X X X X X X X
@@ -451,7 +451,7 @@ void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
}
}
-// only suitable for at most 14bit range.
+// Only suitable for at most 14 bit range.
void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
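For reference, the (3 * near + far + 2) >> 2 kernel that these comments (and the SIMD rows in scale_gcc.cc below) keep describing reduces to per-pixel C along these lines. This is a sketch of the filter only, not a copy of ScaleRowUp2_Linear_C, and it skips the boundary column that the Any wrappers handle.

#include <stdint.h>

/* 2x horizontal upsample, linear filter: each source pair produces two output
 * pixels weighted 3:1 and 1:3, i.e. (3*near + far + 2) >> 2. Interior loop
 * only; the last output column is handled separately. */
static void Up2LinearRowSketch(const uint8_t* src, uint8_t* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    dst[2 * x + 0] = (uint8_t)((3 * src[x] + src[x + 1] + 2) >> 2);
    dst[2 * x + 1] = (uint8_t)((src[x] + 3 * src[x + 1] + 2) >> 2);
  }
}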
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index cfbbba98..db3c9687 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -197,7 +197,6 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
-
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
@@ -485,7 +484,6 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf2) // %2
);
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@@ -532,7 +530,6 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2
);
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm6 \n"
@@ -599,7 +596,6 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
);
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm6 \n"
@@ -692,7 +688,6 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kScaleAb2) // %3
);
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@@ -736,7 +731,6 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kScaleAc33) // %2
);
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@@ -790,7 +784,6 @@ void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
-
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
@@ -847,7 +840,6 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
-
LABELALIGN
"1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0
@@ -962,7 +954,6 @@ void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
-
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
@@ -1015,7 +1006,6 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
-
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n"
@@ -1124,29 +1114,28 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
-
- "pcmpeqw %%xmm4,%%xmm4 \n"
- "psrlw $15,%%xmm4 \n"
- "psllw $1,%%xmm4 \n" // all 2
- "movdqu %3,%%xmm3 \n"
+ "pcmpeqw %%xmm4,%%xmm4 \n"
+ "psrlw $15,%%xmm4 \n"
+ "psllw $1,%%xmm4 \n" // all 2
+ "movdqu %3,%%xmm3 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n" // 01234567
- "movq 1(%0),%%xmm1 \n" // 12345678
- "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
- "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
- "movdqa %%xmm0,%%xmm2 \n"
- "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
- "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
- "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
- "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
- "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
- "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
- "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
- "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
- "vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
+ "movq (%0),%%xmm0 \n" // 01234567
+ "movq 1(%0),%%xmm1 \n" // 12345678
+ "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
+ "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
+ "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
+ "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
+ "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
+ "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
@@ -1167,76 +1156,75 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
-
- "pcmpeqw %%xmm6,%%xmm6 \n"
- "psrlw $15,%%xmm6 \n"
- "psllw $3,%%xmm6 \n" // all 8
- "movdqu %5,%%xmm7 \n"
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $3,%%xmm6 \n" // all 8
+ "movdqu %5,%%xmm7 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n" // 01234567
- "movq 1(%0),%%xmm1 \n" // 12345678
- "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
- "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
- "movdqa %%xmm0,%%xmm2 \n"
- "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
- "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
- "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
- "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
-
- "movq (%0,%3),%%xmm1 \n"
- "movq 1(%0,%3),%%xmm4 \n"
- "punpcklwd %%xmm1,%%xmm1 \n"
- "punpcklwd %%xmm4,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "punpckhdq %%xmm4,%%xmm3 \n"
- "punpckldq %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
- "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
+ "movq (%0),%%xmm0 \n" // 01234567
+ "movq 1(%0),%%xmm1 \n" // 12345678
+ "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
+ "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
+ "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
+ "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
+ "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
+
+ "movq (%0,%3),%%xmm1 \n"
+ "movq 1(%0,%3),%%xmm4 \n"
+ "punpcklwd %%xmm1,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhdq %%xmm4,%%xmm3 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
+ "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
// xmm0 xmm2
// xmm1 xmm3
- "movdqa %%xmm0,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
- "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
- "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
- "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
- "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
-
- "movdqa %%xmm1,%%xmm5 \n"
- "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
- "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
- "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
- "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
- "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
-
- "movdqa %%xmm2,%%xmm0 \n"
- "movdqa %%xmm3,%%xmm1 \n"
- "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
- "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
- "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
- "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
- "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
-
- "movdqa %%xmm3,%%xmm1 \n"
- "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
- "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
- "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
- "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
- "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
-
- "packuswb %%xmm0,%%xmm4 \n"
- "movdqu %%xmm4,(%1) \n" // store above
- "packuswb %%xmm1,%%xmm5 \n"
- "movdqu %%xmm5,(%1,%4) \n" // store below
-
- "lea 0x8(%0),%0 \n"
- "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
+
+ "packuswb %%xmm0,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packuswb %%xmm1,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1257,30 +1245,29 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
-
- "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $15,%%ymm4,%%ymm4 \n"
- "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
- "vmovdqu %3,%%ymm3 \n"
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ "vmovdqu %3,%%ymm3 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
- "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
- "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
- "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
- "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
- "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
- "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
@@ -1301,72 +1288,71 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
-
- "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
- "vpsrlw $15,%%ymm6,%%ymm6 \n"
- "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
- "vmovdqu %5,%%ymm7 \n"
+ "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $15,%%ymm6,%%ymm6 \n"
+ "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
+ "vmovdqu %5,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
- "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
- "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
-
- "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
- "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
- "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
- "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
- "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
- "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
- "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
- "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
+ "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
+
+ "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
+ "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
+ "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
// ymm0 ymm1
// ymm2 ymm3
- "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
- "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
- "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
- "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
- "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
-
- "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
- "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
- "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
- "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
- "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
-
- "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
- "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
- "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
- "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
- "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
-
- "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
- "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
- "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
- "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
- "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
-
- "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
- "vmovdqu %%ymm4,(%1) \n" // store above
- "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
- "vmovdqu %%ymm5,(%1,%4) \n" // store below
-
- "lea 0x10(%0),%0 \n"
- "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1386,35 +1372,34 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
-
- "vmovdqu %3,%%ymm3 \n"
- "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $15,%%ymm4,%%ymm4 \n"
- "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ "vmovdqu %3,%%ymm3 \n"
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
- "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
-
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
-
- "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
- "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
- "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
- "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
- "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
- "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
- "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2
- "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
- "vmovdqu %%ymm0,(%1) \n"
-
- "lea 0x10(%0),%0 \n"
- "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
+ "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1427,37 +1412,36 @@ void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
-
- "vmovdqu %3,%%ymm3 \n"
- "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrld $31,%%ymm4,%%ymm4 \n"
- "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
+ "vmovdqu %3,%%ymm3 \n"
+ "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $31,%%ymm4,%%ymm4 \n"
+ "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
- "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
-
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
-
- "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
- "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
- "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
- "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
- "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
- "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
- "vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
- "vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
- "vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
- "vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
- "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
-
- "lea 0x10(%0),%0 \n"
- "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
+ "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1473,57 +1457,56 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
-
- "vmovdqu %5,%%ymm5 \n"
- "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $15,%%ymm4,%%ymm4 \n"
- "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
+ "vmovdqu %5,%%ymm5 \n"
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
- "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
- "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
- "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
- "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
- "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
- "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
- "vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
- "vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1)
-
- "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
- "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
- "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
- "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
- "vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878
- "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
- "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo)
- "vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi)
- "vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2)
-
- "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
- "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
- "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
- "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
- "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
- "vmovdqu %%ymm0,(%1) \n" // store above
-
- "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
- "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
- "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
- "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
- "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
- "vmovdqu %%ymm0,(%1,%4,2) \n" // store below
-
- "lea 0x10(%0),%0 \n"
- "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
+ "vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1)
+
+ "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo)
+ "vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi)
+ "vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
+ "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
+ "vmovdqu %%ymm0,(%1) \n" // store above
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
+ "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
+ "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
+ "vmovdqu %%ymm0,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1540,70 +1523,69 @@ void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
-
- "vmovdqu %5,%%ymm7 \n"
- "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
- "vpsrld $31,%%ymm6,%%ymm6 \n"
- "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
+ "vmovdqu %5,%%ymm7 \n"
+ "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrld $31,%%ymm6,%%ymm6 \n"
+ "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
- "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
- "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
- "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
- "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
- "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
- "vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
- "vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
-
- "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
- "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
- "vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000
- "vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000
- "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767
- "vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878
- "vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878
- "vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656
- "vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo)
- "vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
-
- "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
- "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
- "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
- "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
- "vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
-
- "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
- "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
- "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
- "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
- "vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
-
- "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
- "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
- "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
- "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
- "vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
-
- "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
- "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
- "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
- "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
- "vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
-
- "vpackssdw %%ymm0,%%ymm4,%%ymm4 \n"
- "vmovdqu %%ymm4,(%1) \n" // store above
- "vpackssdw %%ymm2,%%ymm5,%%ymm5 \n"
- "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
-
- "lea 0x10(%0),%0 \n"
- "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
+ "vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+
+ "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
+ "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000
+ "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767
+ "vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656
+ "vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo)
+ "vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+
+ "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackssdw %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackssdw %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1620,7 +1602,6 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile(
-
"pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
@@ -1653,7 +1634,6 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile(
-
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
@@ -1776,8 +1756,8 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
- "m"(kFsub80), // %8
- "m"(kFadd40) // %9
+ "m"(kFsub80), // %8
+ "m"(kFadd40) // %9
#endif
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
@@ -1793,7 +1773,6 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
(void)x;
(void)dx;
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%1),%%xmm0 \n"
@@ -1820,7 +1799,6 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@@ -1844,7 +1822,6 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@@ -1870,7 +1847,6 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@@ -2057,7 +2033,6 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
(void)x;
(void)dx;
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%1),%%xmm0 \n"
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 51061655..e260dc95 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -509,7 +509,6 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
-
"vmov.u16 q15, #3 \n"
"1: \n"
@@ -527,7 +526,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
"vst2.8 {d0, d1}, [%1]! \n" // store
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -548,7 +547,6 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile(
-
"vmov.u16 q15, #3 \n"
"1: \n"
@@ -612,7 +610,6 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(
-
"vmov.u16 q15, #3 \n"
"1: \n"
@@ -649,7 +646,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile(
-
"vmov.u16 q15, #3 \n"
"1: \n"
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 514dde4c..4b4f2fb1 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -540,7 +540,6 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
-
"movi v31.8b, #3 \n"
"1: \n"
@@ -580,7 +579,6 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile(
-
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
@@ -637,7 +635,6 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(
-
"movi v31.8h, #3 \n"
"1: \n"
@@ -675,7 +672,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile(
-
"movi v31.8h, #3 \n"
"1: \n"
@@ -1317,13 +1313,13 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "ld1 {v0.h}[0], [%0], %6 \n"
- "ld1 {v1.h}[0], [%1], %6 \n"
- "ld1 {v2.h}[0], [%2], %6 \n"
- "ld1 {v3.h}[0], [%3], %6 \n"
- "subs %w5, %w5, #4 \n" // 4 pixels per loop.
- "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
- "b.gt 1b \n"
+ "ld1 {v0.h}[0], [%0], %6 \n"
+ "ld1 {v1.h}[0], [%1], %6 \n"
+ "ld1 {v2.h}[0], [%2], %6 \n"
+ "ld1 {v3.h}[0], [%3], %6 \n"
+ "subs %w5, %w5, #4 \n" // 4 pixels per loop.
+ "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src1_ptr), // %1
"+r"(src2_ptr), // %2
diff --git a/unit_test/color_test.cc b/unit_test/color_test.cc
index 95247214..60bdfdd6 100644
--- a/unit_test/color_test.cc
+++ b/unit_test/color_test.cc
@@ -257,6 +257,32 @@ static void YUVUToRGB(int y, int u, int v, int* r, int* g, int* b) {
*r = orig_pixels[2];
}
+#define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+
+static void YUVVToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ V422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
static void YToRGB(int y, int* r, int* g, int* b) {
const int kWidth = 16;
const int kHeight = 1;
@@ -405,21 +431,21 @@ TEST_F(LibYUVColorTest, TestRoundToByte) {
EXPECT_LE(allb, 255);
}
-// BT.601 YUV to RGB reference
+// BT.601 limited range YUV to RGB reference
static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);
*g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);
*b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);
}
-// JPEG YUV to RGB reference
+// BT.601 full range YUV to RGB reference (aka JPEG)
static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte(y - (v - 128) * -1.40200);
*g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414);
*b = RoundToByte(y - (u - 128) * -1.77200);
}
-// BT.709 YUV to RGB reference
+// BT.709 limited range YUV to RGB reference
// See also http://www.equasys.de/colorconversion.html
static void YUVHToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.793);
@@ -434,7 +460,7 @@ static void YUVFToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*b = RoundToByte(y - (u - 128) * -1.8556);
}
-// BT.2020 YUV to RGB reference
+// BT.2020 limited range YUV to RGB reference
static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164384 - (v - 128) * -1.67867);
*g = RoundToByte((y - 16) * 1.164384 - (u - 128) * 0.187326 -
@@ -442,6 +468,13 @@ static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*b = RoundToByte((y - 16) * 1.164384 - (u - 128) * -2.14177);
}
+// BT.2020 full range YUV to RGB reference
+static void YUVVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte(y + (v - 128) * 1.474600);
+ *g = RoundToByte(y - (u - 128) * 0.164553 - (v - 128) * 0.571353);
+ *b = RoundToByte(y + (u - 128) * 1.881400);
+}
+
TEST_F(LibYUVColorTest, TestYUV) {
int r0, g0, b0, r1, g1, b1;
@@ -573,16 +606,12 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
#else
#define FASTSTEP 5
#endif
+
+// BT.601 limited range.
TEST_F(LibYUVColorTest, TestFullYUV) {
- int rh[256] = {
- 0,
- };
- int gh[256] = {
- 0,
- };
- int bh[256] = {
- 0,
- };
+ int rh[256] = { 0, };
+ int gh[256] = { 0, };
+ int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@@ -602,16 +631,11 @@ TEST_F(LibYUVColorTest, TestFullYUV) {
PrintHistogram(rh, gh, bh);
}
+// BT.601 full range.
TEST_F(LibYUVColorTest, TestFullYUVJ) {
- int rh[256] = {
- 0,
- };
- int gh[256] = {
- 0,
- };
- int bh[256] = {
- 0,
- };
+ int rh[256] = { 0, };
+ int gh[256] = { 0, };
+ int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@@ -631,16 +655,11 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) {
PrintHistogram(rh, gh, bh);
}
+// BT.709 limited range.
TEST_F(LibYUVColorTest, TestFullYUVH) {
- int rh[256] = {
- 0,
- };
- int gh[256] = {
- 0,
- };
- int bh[256] = {
- 0,
- };
+ int rh[256] = { 0, };
+ int gh[256] = { 0, };
+ int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@@ -661,16 +680,11 @@ TEST_F(LibYUVColorTest, TestFullYUVH) {
PrintHistogram(rh, gh, bh);
}
+// BT.709 full range.
TEST_F(LibYUVColorTest, TestFullYUVF) {
- int rh[256] = {
- 0,
- };
- int gh[256] = {
- 0,
- };
- int bh[256] = {
- 0,
- };
+ int rh[256] = { 0, };
+ int gh[256] = { 0, };
+ int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@@ -690,16 +704,11 @@ TEST_F(LibYUVColorTest, TestFullYUVF) {
PrintHistogram(rh, gh, bh);
}
+// BT.2020 limited range.
TEST_F(LibYUVColorTest, TestFullYUVU) {
- int rh[256] = {
- 0,
- };
- int gh[256] = {
- 0,
- };
- int bh[256] = {
- 0,
- };
+ int rh[256] = { 0, };
+ int gh[256] = { 0, };
+ int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@@ -719,6 +728,30 @@ TEST_F(LibYUVColorTest, TestFullYUVU) {
}
PrintHistogram(rh, gh, bh);
}
+
+// BT.2020 full range.
+TEST_F(LibYUVColorTest, TestFullYUVV) {
+ int rh[256] = { 0, };
+ int gh[256] = { 0, };
+ int bh[256] = { 0, };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVVToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVVToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, 2);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
#undef FASTSTEP
TEST_F(LibYUVColorTest, TestGreyYUVJ) {
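As a quick sanity check of the BT.2020 full range reference added above (illustrative only, not part of the test), Y=128, U=96, V=160 should come out to roughly (R, G, B) = (175, 115, 68). A standalone version, using a stand-in RoundToByte that rounds to nearest and clamps to [0, 255]:

#include <math.h>
#include <stdio.h>

static int RoundToByte(double v) {
  long i = lround(v);  // round to nearest
  if (i < 0) i = 0;    // clamp to byte range
  if (i > 255) i = 255;
  return (int)i;
}

int main(void) {
  const int y = 128, u = 96, v = 160;
  int r = RoundToByte(y + (v - 128) * 1.474600);
  int g = RoundToByte(y - (u - 128) * 0.164553 - (v - 128) * 0.571353);
  int b = RoundToByte(y + (u - 128) * 1.881400);
  printf("R=%d G=%d B=%d\n", r, g, b);  // prints R=175 G=115 B=68
  return 0;
}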
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index c180811a..20703200 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -558,7 +558,7 @@ TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
-// Provide matrix wrappers
+// Provide matrix wrappers for full range BT.709
#define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \
I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j)
#define F420ToARGB(a, b, c, d, e, f, g, h, i, j) \
@@ -572,6 +572,20 @@ TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
#define F444ToARGB(a, b, c, d, e, f, g, h, i, j) \
I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j)
+// Provide matrix wrappers for full range BT.2020
+#define V420ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
+#define V420ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+#define V422ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
+#define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+#define V444ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I444ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
+#define V444ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+
#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
@@ -643,6 +657,8 @@ TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(U420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(U420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(V420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(V420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
@@ -667,6 +683,8 @@ TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(U422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(U422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(V422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(V422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
@@ -677,6 +695,8 @@ TESTPLANARTOB(H444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(H444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(U444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(U444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(V444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(V444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
@@ -772,6 +792,12 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
l, m)
+#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
#define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
l, m)
@@ -796,6 +822,12 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
#define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
l, m)
+#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
#define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
l, m)
@@ -820,6 +852,12 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
#define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
l, m)
+#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1)
@@ -829,6 +867,8 @@ TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(V420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(V420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(I422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(J422Alpha, 2, 1, ARGB, 4, 4, 1)
@@ -837,6 +877,8 @@ TESTQPLANARTOB(H422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(H422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(U422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(U422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(V422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(V422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(I444Alpha, 1, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(J444Alpha, 1, 1, ARGB, 4, 4, 1)
@@ -845,6 +887,8 @@ TESTQPLANARTOB(H444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(H444Alpha, 1, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(U444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(U444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(V444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1)
#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \
BPP_B, W1280, N, NEG, OFF) \
@@ -2771,6 +2815,8 @@ TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(U422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(U422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(V422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(V422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ABGR, 4)
@@ -2781,6 +2827,8 @@ TESTPLANARTOE(H444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(U444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(U444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(V444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(V444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
@@ -2862,6 +2910,8 @@ TESTQPLANARTOE(F420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(F420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(U420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(U420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(V420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(V420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(I422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(J422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
@@ -2872,6 +2922,8 @@ TESTQPLANARTOE(H422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(H422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(U422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(U422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(V422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(V422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(I444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(J444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
@@ -2880,6 +2932,8 @@ TESTQPLANARTOE(H444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(H444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(U444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(U444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(V444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
#define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \
OFF, FMT_C, BPP_C) \
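The V-prefixed wrappers above simply forward to the existing *Matrix entry points with kYuvV2020Constants. A minimal application-side sketch (the helper name and strides are illustrative, not from this patch):

#include <stdint.h>

#include "libyuv/convert_argb.h"

// Hypothetical helper: convert one I420 frame to ARGB using the new
// full range BT.2020 constants, mirroring the V420ToARGB test macro.
int ConvertV420ToARGB(const uint8_t* y, int y_stride,
                      const uint8_t* u, int u_stride,
                      const uint8_t* v, int v_stride,
                      uint8_t* dst_argb, int dst_stride,
                      int width, int height) {
  return libyuv::I420ToARGBMatrix(y, y_stride, u, u_stride, v, v_stride,
                                  dst_argb, dst_stride,
                                  &libyuv::kYuvV2020Constants, width, height);
}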
diff --git a/util/color.cc b/util/color.cc
new file mode 100644
index 00000000..2333276b
--- /dev/null
+++ b/util/color.cc
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2021 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// This utility computes values needed to generate yuvconstants based on
+// white point values.
+// The yuv formulas are tuned for 8-bit YUV channels.
+
+// For those matrix coefficients (MC) that can be expressed in terms of kr and kb:
+// Full range
+// float M[3][3] {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
+// float B[3] {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
+// Limited range
+// float M[3][3] {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
+// float B[3] {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
+
+// mc  standard
+// 1 bt.709 KR = 0.2126; KB = 0.0722
+// 4 fcc KR = 0.30; KB = 0.11
+// 6 bt.601 KR = 0.299; KB = 0.114
+// 7 SMPTE 240M KR = 0.212; KB = 0.087
+// 10 bt2020 KR = 0.2627; KB = 0.0593
+
+// BT.709 full range YUV to RGB reference
+// R = Y + V * 1.5748
+// G = Y - U * 0.18732 - V * 0.46812
+// B = Y + U * 1.8556
+// KR = 0.2126
+// KB = 0.0722
+
+// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/
+
+// // Y contribution to R,G,B. Scale and bias.
+// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+// #define YB 32 /* 64 / 2 */
+//
+// // U and V contributions to R,G,B.
+// #define UB 113 /* round(1.77200 * 64) */
+// #define UG 22 /* round(0.34414 * 64) */
+// #define VG 46 /* round(0.71414 * 64) */
+// #define VR 90 /* round(1.40200 * 64) */
+//
+// // Bias values to round, and subtract 128 from U and V.
+// #define BB (-UB * 128 + YB)
+// #define BG (UG * 128 + VG * 128 + YB)
+// #define BR (-VR * 128 + YB)
+
+int round(float v) {
+ return (int) (v + 0.5);
+}
+
+int main(int argc, const char* argv[]) {
+
+ if (argc < 3) {
+ printf("color kr kb\n");
+ return -1;
+ }
+ float kr = atof(argv[1]);
+ float kb = atof(argv[2]);
+ float kg = 1 - kr - kb;
+
+ float vr = 2 * (1 - kr);
+ float ug = 2 * ((1 - kb) * kb / kg);
+ float vg = 2 * ((1 - kr) * kr / kg);
+ float ub = 2 * (1 - kb);
+
+ printf("Full range\n");
+ printf("R = Y + V * %5f\n", vr);
+ printf("G = Y - U * %6f - V * %6f\n", ug, vg);
+ printf("B = Y + U * %5f\n", ub);
+
+ printf("KR = %4f; ", kr);
+ printf("KB = %4f\n", kb);
+// printf("KG = %4f\n", kg);
+// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+// #define YB 32 /* 64 / 2 */
+//
+// // U and V contributions to R,G,B.
+
+ printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
+ printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
+ printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
+ printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
+
+ vr = 255.f / 224.f * 2 * (1 - kr);
+ ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
+ vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg);
+ ub = 255.f / 224.f * 2 * (1 - kb);
+
+ printf("Limited range\n");
+ printf("R = (Y - 16) * 1.164 + V * %5f\n", vr);
+ printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
+ printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
+
+// printf("KG = %4f\n", kg);
+// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+// #define YB 32 /* 64 / 2 */
+//
+// // U and V contributions to R,G,B.
+
+ printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
+ printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
+ printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
+ printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
+
+ return 0;
+}
+
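Usage sketch for the new utility (the build and invocation below are assumptions, not documented in the patch):

  g++ util/color.cc -o color
  ./color 0.2627 0.0593    # BT.2020 kr, kb

For the BT.2020 coefficients this should report a full range mapping of roughly R = Y + V * 1.4746, G = Y - U * 0.1646 - V * 0.5714, B = Y + U * 1.8814, which matches the BT.2020 full range reference added to unit_test/color_test.cc.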