author     Frank Barchard <fbarchard@google.com>      2021-02-05 16:14:25 -0800
committer  Frank Barchard <fbarchard@chromium.org>    2021-02-06 00:26:55 +0000
commit     942c5084482d8592883be66151e0dea502f4cbc0 (patch)
tree       e02d37c46fb6e9d06d5165fc86ce483b70928524
parent     60d37a064bc0307017537ed3091b1b0204213855 (diff)
download   libyuv-942c5084482d8592883be66151e0dea502f4cbc0.tar.gz
BT.2020 Full Range yuvconstants
New color util to compute the constants needed, based on the white point.

[ RUN      ] LibYUVColorTest.TestFullYUVV
hist      -2      -1        0       1      2
red        0 1627136 13670144 1479936      0
green 319285 3456836  9243059 3440771 317265
blue       0 1561088 14202112 1014016      0

Bug: libyuv:877, b/178283356
Change-Id: If432ebfab76b01302fdb416a153c4f26ca0832d6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2678859
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
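For context on what the new util computes: full range coefficients follow directly from a colorspace's KR/KB (the same WR/WB relations that the old F709 comment, removed in row_common.cc below, spelled out). A minimal sketch of that derivation in C, assuming only those relations; this is an illustration, not the util/color.cc added by this change.

#include <math.h>
#include <stdio.h>

/* Illustrative only. For BT.2020 (KR = 0.2627, KB = 0.0593) this reproduces
 * the full range coefficients added to row_common.cc below:
 * UB 120, UG 11, VG 37, VR 94. */
static void PrintFullRangeCoefficients(double kr, double kb) {
  double kg = 1.0 - kr - kb;
  double ub = 2.0 * (1.0 - kb);           /* B = Y + ub * (U - 128) */
  double vr = 2.0 * (1.0 - kr);           /* R = Y + vr * (V - 128) */
  double ug = 2.0 * kb * (1.0 - kb) / kg; /* G = Y - ug*(U-128) - vg*(V-128) */
  double vg = 2.0 * kr * (1.0 - kr) / kg;
  /* libyuv stores each factor in 6 bit fixed point, i.e. round(x * 64). */
  printf("UB %.0f UG %.0f VG %.0f VR %.0f\n", round(ub * 64), round(ug * 64),
         round(vg * 64), round(vr * 64));
}

int main(void) {
  PrintFullRangeCoefficients(0.2627, 0.0593); /* UB 120 UG 11 VG 37 VR 94 */
  return 0;
}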
-rw-r--r--  README.chromium                   2
-rw-r--r--  include/libyuv/convert.h          2
-rw-r--r--  include/libyuv/convert_argb.h    24
-rw-r--r--  include/libyuv/version.h          2
-rw-r--r--  source/convert_from.cc            2
-rw-r--r--  source/row_common.cc            312
-rw-r--r--  source/scale.cc                  18
-rw-r--r--  source/scale_any.cc               2
-rw-r--r--  source/scale_common.cc            6
-rw-r--r--  source/scale_gcc.cc             661
-rw-r--r--  source/scale_neon.cc              6
-rw-r--r--  source/scale_neon64.cc           18
-rw-r--r--  unit_test/color_test.cc         131
-rw-r--r--  unit_test/convert_test.cc        56
-rw-r--r--  util/color.cc                   118
15 files changed, 761 insertions, 599 deletions
diff --git a/README.chromium b/README.chromium
index d27d1aa3..bdd05f1f 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1774
+Version: 1775
License: BSD
License File: LICENSE
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
index 50ffc2f0..137b30f1 100644
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -213,7 +213,7 @@ int I010ToI410(const uint16_t* src_y,
// Convert I012 to I412
#define I012ToI412 I010ToI410
-// Convert I212 to I412
+// Convert I210 to I410
LIBYUV_API
int I210ToI410(const uint16_t* src_y,
int src_stride_y,
diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h
index d9cc5bd2..cf7f923e 100644
--- a/include/libyuv/convert_argb.h
+++ b/include/libyuv/convert_argb.h
@@ -21,18 +21,20 @@ extern "C" {
#endif
// Conversion matrix for YUV to RGB
-LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
-LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg
-LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full
-LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
-LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg
+LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full
+LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYuvV2020Constants; // BT.2020 full
// Conversion matrix for YVU to BGR
-LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
-LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg
-LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full
-LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
-LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg
+LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full
+LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full
// Macros for end swapped destination Matrix conversions.
// Swap UV and pass mirrored kYvuJPEGConstants matrix.
@@ -42,6 +44,8 @@ LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
#define kYuvF709ConstantsVU kYvuF709Constants
#define kYuvH709ConstantsVU kYvuH709Constants
#define kYuv2020ConstantsVU kYvu2020Constants
+#define kYuvV2020ConstantsVU kYvuV2020Constants
+
#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index ff3c9dec..a57dfa53 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1774
+#define LIBYUV_VERSION 1775
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 6524f969..591e2782 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -30,6 +30,8 @@ static __inline int Abs(int v) {
}
// I420 To any I4xx YUV format with mirroring.
+// TODO(fbarchard): Consider kFilterNone for Y, or CopyPlane
+
static int I420ToI4xx(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
diff --git a/source/row_common.cc b/source/row_common.cc
index c3942cf7..eb889c83 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1330,234 +1330,218 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
// Macros to create SIMD specific yuv to rgb conversion constants.
#if defined(__aarch64__)
-#define MAKEYUVCONSTANTS(name, YG, YGB, UB, UG, VG, VR, BB, BG, BR) \
+#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
+ const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
+ {UB, VR, UB, VR, UB, VR, UB, VR}, {UB, VR, UB, VR, UB, VR, UB, VR}, \
+ {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, \
+ {BB, BG, BR, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}}; \
+ const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
+ {VR, UB, VR, UB, VR, UB, VR, UB}, {VR, UB, VR, UB, VR, UB, VR, UB}, \
+ {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, \
+ {BR, BG, BB, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}};
+
+#elif defined(__arm__)
+#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, \
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, \
- {UG, VG, UG, VG, UG, VG, UG, VG}, \
- {UG, VG, UG, VG, UG, VG, UG, VG}, \
- {BB, BG, BR, YGB, 0, 0, 0, 0}, \
+ {UB, UB, UB, UB, VR, VR, VR, VR, 0, 0, 0, 0, 0, 0, 0, 0}, \
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, \
+ {BB, BG, BR, YB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, \
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, \
- {VG, UG, VG, UG, VG, UG, VG, UG}, \
- {VG, UG, VG, UG, VG, UG, VG, UG}, \
- {BR, BG, BB, YGB, 0, 0, 0, 0}, \
- {0x0101 * YG, YG, 0, 0}};
-
-#elif defined(__arm__)
-#define MAKEYUVCONSTANTS(name, YG, YGB, UB, UG, VG, VR, BB, BG, BR) \
- const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
- {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, \
- {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, \
- {BB, BG, BR, YGB, 0, 0, 0, 0}, \
- {0x0101 * YG, YG, 0, 0}}; \
- const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
- {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, \
- {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, \
- {BR, BG, BB, YGB, 0, 0, 0, 0}, \
+ {VR, VR, VR, VR, UB, UB, UB, UB, 0, 0, 0, 0, 0, 0, 0, 0}, \
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, \
+ {BR, BG, BB, YB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}};
#else
-#define MAKEYUVCONSTANTS(name, YG, YGB, UB, UG, VG, VR, BB, BG, BR) \
- const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
- {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \
- {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
- {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
- {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, \
- YGB, YGB}}; \
- const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
- {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, \
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, \
- {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, \
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, \
- {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, \
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, \
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
- {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, \
- YGB, YGB}};
+#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
+ const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
+ {-UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, \
+ -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0}, \
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
+ {0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, \
+ 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR}, \
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
+ {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}; \
+ const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
+ {-VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, \
+ -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0}, \
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, \
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, \
+ {0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, \
+ 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB}, \
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
+ {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}};
#endif
// TODO(fbarchard): Generate SIMD structures from float matrix.
-// BT.601 YUV to RGB reference
-// R = (Y - 16) * 1.164 - V * -1.596
-// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
-// B = (Y - 16) * 1.164 - U * -2.018
+// Bias values to round, and subtract 128 from U and V.
+#define BB (-UB * 128 + YB)
+#define BG (UG * 128 + VG * 128 + YB)
+#define BR (-VR * 128 + YB)
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+// BT.601 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164 + V * 1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 + U * 2.018
+// KR = 0.299; KB = 0.114
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
-#define VR -102 /* round(-1.596 * 64) */
+#define UB 128 /* max(128, round(2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR 102 /* round(1.596 * 64) */
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-MAKEYUVCONSTANTS(I601, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
-// JPEG YUV to RGB reference
-// * R = Y - V * -1.40200
-// * G = Y - U * 0.34414 - V * 0.71414
-// * B = Y - U * -1.77200
-
-// Y contribution to R,G,B. Scale and bias.
-#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32 /* 64 / 2 */
+// BT.601 full range YUV to RGB reference (aka JPEG)
+// * R = Y + V * 1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y + U * 1.77200
+// KR = 0.299; KB = 0.114
// U and V contributions to R,G,B.
-#define UB -113 /* round(-1.77200 * 64) */
-#define UG 22 /* round(0.34414 * 64) */
-#define VG 46 /* round(0.71414 * 64) */
-#define VR -90 /* round(-1.40200 * 64) */
+#define UB 113 /* round(1.77200 * 64) */
+#define UG 22 /* round(0.34414 * 64) */
+#define VG 46 /* round(0.71414 * 64) */
+#define VR 90 /* round(1.40200 * 64) */
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+// Y contribution to R,G,B. Scale and bias.
+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YB 32 /* 64 / 2 */
-MAKEYUVCONSTANTS(JPEG, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
-// BT.709 YUV to RGB reference
-// R = (Y - 16) * 1.164 - V * -1.793
-// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
-// B = (Y - 16) * 1.164 - U * -2.112
-
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+// BT.709 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164 + V * 1.793
+// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
+// B = (Y - 16) * 1.164 + U * 2.112
+// KR = 0.2126, KB = 0.0722
// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.112 * 64)) */
-#define UG 14 /* round(0.213 * 64) */
-#define VG 34 /* round(0.533 * 64) */
-#define VR -115 /* round(-1.793 * 64) */
+#define UB 128 /* max(128, round(2.112 * 64)) */
+#define UG 14 /* round(0.213 * 64) */
+#define VG 34 /* round(0.533 * 64) */
+#define VR 115 /* round(1.793 * 64) */
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-MAKEYUVCONSTANTS(H709, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
// BT.709 full range YUV to RGB reference
-// R = Y - V * -1.5748
-// G = Y - U * 0.18732 - V * 0.46812
-// B = Y - U * -1.8556
-// WR = 0.2126
-// WB = 0.0722
-// WR and WB given, the equations are:
-// R = Y + (2 * (1 - WR)) * V;
-// G = Y - ((2 * ((WR * (1 - WR) * V) + (WB * (1 - WB) * U))) / (1 - WB - WR));
-// B = Y + (2 * (1 - WB)) * U;
-
-// Y contribution to R,G,B. Scale and bias. (same as jpeg)
-#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
-#define YGB 32 /* 64 / 2 */
+// R = Y + V * 1.5748
+// G = Y - U * 0.18732 - V * 0.46812
+// B = Y + U * 1.8556
+// KR = 0.2126, KB = 0.0722
// U and V contributions to R,G,B.
-#define UB -119 /* round(-1.8556 * 64) */
-#define UG 12 /* round(0.18732 * 64) */
-#define VG 30 /* round(0.46812 * 64) */
-#define VR -101 /* round(-1.5748 * 64) */
+#define UB 119 /* round(1.8556 * 64) */
+#define UG 12 /* round(0.18732 * 64) */
+#define VG 30 /* round(0.46812 * 64) */
+#define VR 101 /* round(1.5748 * 64) */
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+// Y contribution to R,G,B. Scale and bias. (same as jpeg)
+#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
+#define YB 32 /* 64 / 2 */
-MAKEYUVCONSTANTS(F709, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
-
-// BT.2020 YUV to RGB reference
-// R = (Y - 16) * 1.164384 - V * -1.67867
-// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
-// B = (Y - 16) * 1.164384 - U * -2.14177
-// Y contribution to R,G,B. Scale and bias.
-#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
+// BT.2020 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164384 + V * 1.67867
+// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
+// B = (Y - 16) * 1.164384 + U * 2.14177
+// KR = 0.2627; KB = 0.0593
// TODO(fbarchard): Improve accuracy; the B channel is off by 7%.
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.142 * 64)) */
-#define UG 12 /* round(0.187326 * 64) */
-#define VG 42 /* round(0.65042 * 64) */
-#define VR -107 /* round(-1.67867 * 64) */
+#define UB 128 /* max(128, round(2.142 * 64)) */
+#define UG 12 /* round(0.187326 * 64) */
+#define VG 42 /* round(0.65042 * 64) */
+#define VR 107 /* round(1.67867 * 64) */
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+// Y contribution to R,G,B. Scale and bias.
+#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
-MAKEYUVCONSTANTS(2020, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
+MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
+
+// BT.2020 full range YUV to RGB reference
+// R = Y + V * 1.474600
+// G = Y - U * 0.164553 - V * 0.571353
+// B = Y + U * 1.881400
+// KR = 0.2627; KB = 0.0593
+
+#define UB 120 /* round(1.881400 * 64) */
+#define UG 11 /* round(0.164553 * 64) */
+#define VG 37 /* round(0.571353 * 64) */
+#define VR 94 /* round(1.474600 * 64) */
+
+// Y contribution to R,G,B. Scale and bias. (same as jpeg)
+#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
+#define YB 32 /* 64 / 2 */
+
+MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
+
#undef YG
+#undef YB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+
+#undef BB
+#undef BG
+#undef BR
#undef MAKEYUVCONSTANTS
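To tie the new V2020 table back to its reference comment: each factor is stored as round(value * 64), and the BB/BG/BR biases fold in the 128 offset on U and V plus the rounding term YB. A floating point reference of the kind the color test measures against might look like the sketch below (hypothetical helper; the test's own V2020 reference is outside this excerpt). The histogram in the commit message, with every channel within plus or minus 2, is the size of error the fixed point path shows against a reference of this form.

#include <math.h>

static int Clamp255(double v) {
  return v < 0.0 ? 0 : (v > 255.0 ? 255 : (int)lrint(v));
}

/* BT.2020 full range YUV to RGB, taken directly from the reference comment
 * above. Hypothetical test helper, not part of this change. */
static void V2020ToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
  *r = Clamp255(y + (v - 128) * 1.474600);
  *g = Clamp255(y - (u - 128) * 0.164553 - (v - 128) * 0.571353);
  *b = Clamp255(y + (u - 128) * 1.881400);
}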
diff --git a/source/scale.cc b/source/scale.cc
index 34c05699..16771cd8 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1336,7 +1336,7 @@ void ScalePlaneBilinearUp(int src_width,
}
}
-// Scale plane, horizontally 2 times, vertically any time.
+// Scale plane, horizontally up by 2 times.
// Uses linear filter horizontally, nearest vertically.
// This is an optimized version for scaling up a plane to 2 times of
// its original width, using linear interpolation.
@@ -1356,7 +1356,7 @@ void ScalePlaneUp2_Linear(int src_width,
int dy;
// This function can only scale up by 2 times horizontally.
- assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
+ assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -1396,7 +1396,7 @@ void ScalePlaneUp2_Linear(int src_width,
}
}
-// Scale plane, 2 times.
+// Scale plane, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original size, using bilinear interpolation.
// This is used to scale U and V planes of I420 to I444.
@@ -1414,7 +1414,7 @@ void ScalePlaneUp2_Bilinear(int src_width,
int x;
// This function can only scale up by 2 times.
- assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
+ assert(src_width == ((dst_width + 1) / 2));
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
@@ -1449,7 +1449,7 @@ void ScalePlaneUp2_Bilinear(int src_width,
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
- // TODO test performance of writing one row of destination at a time
+ // TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
@@ -1458,7 +1458,7 @@ void ScalePlaneUp2_Bilinear(int src_width,
}
}
-// Scale at most 14bit plane, horizontally 2 times.
+// Scale at most 14 bit plane, horizontally up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original width, using linear interpolation.
// stride is in count of uint16_t.
@@ -1478,7 +1478,7 @@ void ScalePlaneUp2_16_Linear(int src_width,
int dy;
// This function can only scale up by 2 times horizontally.
- assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
+ assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -1512,7 +1512,7 @@ void ScalePlaneUp2_16_Linear(int src_width,
}
}
-// Scale at most 12bit plane, up 2 times.
+// Scale at most 12 bit plane, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original size, using bilinear interpolation.
// stride is in count of uint16_t.
@@ -1531,7 +1531,7 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
int x;
// This function can only scale up by 2 times.
- assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
+ assert(src_width == ((dst_width + 1) / 2));
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
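The rewritten asserts accept exactly the same shapes as before, dst_width equal to either 2 * src_width or 2 * src_width - 1, just expressed from the destination side. A quick equivalence check for positive widths:

#include <assert.h>

/* For src_width > 0, the accepted set { 2*src_width, 2*src_width - 1 } of
 * dst_width values is exactly what src_width == (dst_width + 1) / 2 allows. */
static void CheckUp2AssertEquivalence(int src_width, int dst_width) {
  int old_ok = (src_width * 2 == dst_width) || (src_width * 2 == dst_width + 1);
  int new_ok = (src_width == (dst_width + 1) / 2);
  assert(old_ok == new_ok);
}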
diff --git a/source/scale_any.cc b/source/scale_any.cc
index 5fd27ae6..79394985 100644
--- a/source/scale_any.cc
+++ b/source/scale_any.cc
@@ -625,7 +625,7 @@ CANY(ScaleARGBFilterCols_Any_MSA,
dst_ptr[dst_width - 1] = src_ptr[(dst_width / 2) - 1]; \
}
-// Even the C version need to be wrapped, because boundary pixels have to
+// Even the C versions need to be wrapped, because boundary pixels have to
// be handled differently
SUH2LANY(ScaleRowUp2_Linear_Any_C,
diff --git a/source/scale_common.cc b/source/scale_common.cc
index f53e2de9..8d41c03d 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -400,7 +400,7 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
}
}
-// sample position: (O is src sample position, X is dst sample position)
+// Sample position: (O is src sample position, X is dst sample position)
//
// v dst_ptr at here v stop at here
// X O X X O X X O X X O X X O X
@@ -417,7 +417,7 @@ void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
}
}
-// sample position: (O is src sample position, X is dst sample position)
+// Sample position: (O is src sample position, X is dst sample position)
//
// src_ptr at here
// X v X X X X X X X X X
@@ -451,7 +451,7 @@ void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
}
}
-// only suitable for at most 14bit range.
+// Only suitable for at most 14 bit range.
void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
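For reference, the (3 * near + far + 2) >> 2 kernel that these comments (and the SIMD rows in scale_gcc.cc below) keep describing reduces to per-pixel C along these lines. This is a sketch of the filter only, not a copy of ScaleRowUp2_Linear_C, and it skips the boundary column that the Any wrappers handle.

#include <stdint.h>

/* 2x horizontal upsample, linear filter: each source pair produces two output
 * pixels weighted 3:1 and 1:3, i.e. (3*near + far + 2) >> 2. Interior loop
 * only; the last output column is handled separately. */
static void Up2LinearRowSketch(const uint8_t* src, uint8_t* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    dst[2 * x + 0] = (uint8_t)((3 * src[x] + src[x + 1] + 2) >> 2);
    dst[2 * x + 1] = (uint8_t)((src[x] + 3 * src[x + 1] + 2) >> 2);
  }
}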
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index cfbbba98..db3c9687 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -197,7 +197,6 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
-
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
@@ -485,7 +484,6 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf2) // %2
);
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@@ -532,7 +530,6 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2
);
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm6 \n"
@@ -599,7 +596,6 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
);
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm6 \n"
@@ -692,7 +688,6 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kScaleAb2) // %3
);
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@@ -736,7 +731,6 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kScaleAc33) // %2
);
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@@ -790,7 +784,6 @@ void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
-
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
@@ -847,7 +840,6 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
-
LABELALIGN
"1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0
@@ -962,7 +954,6 @@ void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
-
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
@@ -1015,7 +1006,6 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
-
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n"
@@ -1124,29 +1114,28 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
-
- "pcmpeqw %%xmm4,%%xmm4 \n"
- "psrlw $15,%%xmm4 \n"
- "psllw $1,%%xmm4 \n" // all 2
- "movdqu %3,%%xmm3 \n"
+ "pcmpeqw %%xmm4,%%xmm4 \n"
+ "psrlw $15,%%xmm4 \n"
+ "psllw $1,%%xmm4 \n" // all 2
+ "movdqu %3,%%xmm3 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n" // 01234567
- "movq 1(%0),%%xmm1 \n" // 12345678
- "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
- "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
- "movdqa %%xmm0,%%xmm2 \n"
- "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
- "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
- "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
- "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
- "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
- "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
- "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
- "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
- "vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
+ "movq (%0),%%xmm0 \n" // 01234567
+ "movq 1(%0),%%xmm1 \n" // 12345678
+ "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
+ "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
+ "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
+ "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
+ "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
+ "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
@@ -1167,76 +1156,75 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
-
- "pcmpeqw %%xmm6,%%xmm6 \n"
- "psrlw $15,%%xmm6 \n"
- "psllw $3,%%xmm6 \n" // all 8
- "movdqu %5,%%xmm7 \n"
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $3,%%xmm6 \n" // all 8
+ "movdqu %5,%%xmm7 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n" // 01234567
- "movq 1(%0),%%xmm1 \n" // 12345678
- "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
- "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
- "movdqa %%xmm0,%%xmm2 \n"
- "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
- "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
- "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
- "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
-
- "movq (%0,%3),%%xmm1 \n"
- "movq 1(%0,%3),%%xmm4 \n"
- "punpcklwd %%xmm1,%%xmm1 \n"
- "punpcklwd %%xmm4,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "punpckhdq %%xmm4,%%xmm3 \n"
- "punpckldq %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
- "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
+ "movq (%0),%%xmm0 \n" // 01234567
+ "movq 1(%0),%%xmm1 \n" // 12345678
+ "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
+ "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
+ "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
+ "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
+ "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
+
+ "movq (%0,%3),%%xmm1 \n"
+ "movq 1(%0,%3),%%xmm4 \n"
+ "punpcklwd %%xmm1,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhdq %%xmm4,%%xmm3 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
+ "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
// xmm0 xmm2
// xmm1 xmm3
- "movdqa %%xmm0,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
- "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
- "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
- "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
- "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
-
- "movdqa %%xmm1,%%xmm5 \n"
- "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
- "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
- "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
- "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
- "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
-
- "movdqa %%xmm2,%%xmm0 \n"
- "movdqa %%xmm3,%%xmm1 \n"
- "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
- "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
- "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
- "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
- "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
-
- "movdqa %%xmm3,%%xmm1 \n"
- "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
- "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
- "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
- "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
- "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
-
- "packuswb %%xmm0,%%xmm4 \n"
- "movdqu %%xmm4,(%1) \n" // store above
- "packuswb %%xmm1,%%xmm5 \n"
- "movdqu %%xmm5,(%1,%4) \n" // store below
-
- "lea 0x8(%0),%0 \n"
- "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
+
+ "packuswb %%xmm0,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packuswb %%xmm1,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1257,30 +1245,29 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
-
- "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $15,%%ymm4,%%ymm4 \n"
- "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
- "vmovdqu %3,%%ymm3 \n"
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ "vmovdqu %3,%%ymm3 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
- "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
- "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
- "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
- "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
- "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
- "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
@@ -1301,72 +1288,71 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
-
- "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
- "vpsrlw $15,%%ymm6,%%ymm6 \n"
- "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
- "vmovdqu %5,%%ymm7 \n"
+ "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $15,%%ymm6,%%ymm6 \n"
+ "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
+ "vmovdqu %5,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
- "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
- "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
-
- "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
- "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
- "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
- "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
- "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
- "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
- "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
- "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
+ "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
+
+ "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
+ "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
+ "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
// ymm0 ymm1
// ymm2 ymm3
- "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
- "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
- "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
- "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
- "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
-
- "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
- "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
- "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
- "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
- "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
-
- "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
- "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
- "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
- "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
- "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
-
- "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
- "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
- "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
- "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
- "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
-
- "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
- "vmovdqu %%ymm4,(%1) \n" // store above
- "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
- "vmovdqu %%ymm5,(%1,%4) \n" // store below
-
- "lea 0x10(%0),%0 \n"
- "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1386,35 +1372,34 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
-
- "vmovdqu %3,%%ymm3 \n"
- "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $15,%%ymm4,%%ymm4 \n"
- "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ "vmovdqu %3,%%ymm3 \n"
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
- "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
-
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
-
- "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
- "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
- "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
- "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
- "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
- "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
- "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2
- "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
- "vmovdqu %%ymm0,(%1) \n"
-
- "lea 0x10(%0),%0 \n"
- "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
+ "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1427,37 +1412,36 @@ void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
-
- "vmovdqu %3,%%ymm3 \n"
- "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrld $31,%%ymm4,%%ymm4 \n"
- "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
+ "vmovdqu %3,%%ymm3 \n"
+ "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $31,%%ymm4,%%ymm4 \n"
+ "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
- "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
-
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
-
- "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
- "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
- "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
- "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
- "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
- "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
- "vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
- "vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
- "vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
- "vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
- "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
-
- "lea 0x10(%0),%0 \n"
- "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
+ "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1473,57 +1457,56 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
-
- "vmovdqu %5,%%ymm5 \n"
- "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $15,%%ymm4,%%ymm4 \n"
- "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
+ "vmovdqu %5,%%ymm5 \n"
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
- "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
- "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
- "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
- "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
- "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
- "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
- "vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
- "vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1)
-
- "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
- "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
- "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
- "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
- "vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878
- "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
- "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo)
- "vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi)
- "vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2)
-
- "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
- "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
- "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
- "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
- "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
- "vmovdqu %%ymm0,(%1) \n" // store above
-
- "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
- "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
- "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
- "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
- "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
- "vmovdqu %%ymm0,(%1,%4,2) \n" // store below
-
- "lea 0x10(%0),%0 \n"
- "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
+ "vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1)
+
+ "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo)
+ "vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi)
+ "vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
+ "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
+ "vmovdqu %%ymm0,(%1) \n" // store above
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
+ "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
+ "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
+ "vmovdqu %%ymm0,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1540,70 +1523,69 @@ void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
-
- "vmovdqu %5,%%ymm7 \n"
- "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
- "vpsrld $31,%%ymm6,%%ymm6 \n"
- "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
+ "vmovdqu %5,%%ymm7 \n"
+ "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrld $31,%%ymm6,%%ymm6 \n"
+ "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
- "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
- "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
- "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
- "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
- "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
- "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
- "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
- "vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
- "vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
-
- "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
- "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
- "vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000
- "vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000
- "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767
- "vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878
- "vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878
- "vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656
- "vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo)
- "vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
-
- "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
- "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
- "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
- "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
- "vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
-
- "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
- "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
- "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
- "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
- "vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
-
- "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
- "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
- "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
- "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
- "vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
-
- "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
- "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
- "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
- "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
- "vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
-
- "vpackssdw %%ymm0,%%ymm4,%%ymm4 \n"
- "vmovdqu %%ymm4,(%1) \n" // store above
- "vpackssdw %%ymm2,%%ymm5,%%ymm5 \n"
- "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
-
- "lea 0x10(%0),%0 \n"
- "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
+ "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
+ "vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
+ "vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+
+ "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
+ "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000
+ "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767
+ "vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878
+ "vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878
+ "vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656
+ "vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo)
+ "vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+
+ "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackssdw %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackssdw %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1620,7 +1602,6 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile(
-
"pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
@@ -1653,7 +1634,6 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile(
-
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
@@ -1776,8 +1756,8 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
- "m"(kFsub80), // %8
- "m"(kFadd40) // %9
+ "m"(kFsub80), // %8
+ "m"(kFadd40) // %9
#endif
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
@@ -1793,7 +1773,6 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
(void)x;
(void)dx;
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%1),%%xmm0 \n"
@@ -1820,7 +1799,6 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@@ -1844,7 +1822,6 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@@ -1870,7 +1847,6 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@@ -2057,7 +2033,6 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
(void)x;
(void)dx;
asm volatile(
-
LABELALIGN
"1: \n"
"movdqu (%1),%%xmm0 \n"
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 51061655..e260dc95 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -509,7 +509,6 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
-
"vmov.u16 q15, #3 \n"
"1: \n"
@@ -527,7 +526,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
"vst2.8 {d0, d1}, [%1]! \n" // store
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -548,7 +547,6 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile(
-
"vmov.u16 q15, #3 \n"
"1: \n"
@@ -612,7 +610,6 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(
-
"vmov.u16 q15, #3 \n"
"1: \n"
@@ -649,7 +646,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile(
-
"vmov.u16 q15, #3 \n"
"1: \n"
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 514dde4c..4b4f2fb1 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -540,7 +540,6 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
-
"movi v31.8b, #3 \n"
"1: \n"
@@ -580,7 +579,6 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile(
-
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
@@ -637,7 +635,6 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(
-
"movi v31.8h, #3 \n"
"1: \n"
@@ -675,7 +672,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile(
-
"movi v31.8h, #3 \n"
"1: \n"
@@ -1317,13 +1313,13 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "ld1 {v0.h}[0], [%0], %6 \n"
- "ld1 {v1.h}[0], [%1], %6 \n"
- "ld1 {v2.h}[0], [%2], %6 \n"
- "ld1 {v3.h}[0], [%3], %6 \n"
- "subs %w5, %w5, #4 \n" // 4 pixels per loop.
- "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
- "b.gt 1b \n"
+ "ld1 {v0.h}[0], [%0], %6 \n"
+ "ld1 {v1.h}[0], [%1], %6 \n"
+ "ld1 {v2.h}[0], [%2], %6 \n"
+ "ld1 {v3.h}[0], [%3], %6 \n"
+ "subs %w5, %w5, #4 \n" // 4 pixels per loop.
+ "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src1_ptr), // %1
"+r"(src2_ptr), // %2
diff --git a/unit_test/color_test.cc b/unit_test/color_test.cc
index 95247214..60bdfdd6 100644
--- a/unit_test/color_test.cc
+++ b/unit_test/color_test.cc
@@ -257,6 +257,32 @@ static void YUVUToRGB(int y, int u, int v, int* r, int* g, int* b) {
*r = orig_pixels[2];
}
+#define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+
+static void YUVVToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ V422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
static void YToRGB(int y, int* r, int* g, int* b) {
const int kWidth = 16;
const int kHeight = 1;
@@ -405,21 +431,21 @@ TEST_F(LibYUVColorTest, TestRoundToByte) {
EXPECT_LE(allb, 255);
}
-// BT.601 YUV to RGB reference
+// BT.601 limited range YUV to RGB reference
static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);
*g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);
*b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);
}
-// JPEG YUV to RGB reference
+// BT.601 full range YUV to RGB reference (aka JPEG)
static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte(y - (v - 128) * -1.40200);
*g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414);
*b = RoundToByte(y - (u - 128) * -1.77200);
}
-// BT.709 YUV to RGB reference
+// BT.709 limited range YUV to RGB reference
// See also http://www.equasys.de/colorconversion.html
static void YUVHToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.793);
@@ -434,7 +460,7 @@ static void YUVFToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*b = RoundToByte(y - (u - 128) * -1.8556);
}
-// BT.2020 YUV to RGB reference
+// BT.2020 limited range YUV to RGB reference
static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164384 - (v - 128) * -1.67867);
*g = RoundToByte((y - 16) * 1.164384 - (u - 128) * 0.187326 -
@@ -442,6 +468,13 @@ static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*b = RoundToByte((y - 16) * 1.164384 - (u - 128) * -2.14177);
}
+// BT.2020 full range YUV to RGB reference
+static void YUVVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte(y + (v - 128) * 1.474600);
+ *g = RoundToByte(y - (u - 128) * 0.164553 - (v - 128) * 0.571353);
+ *b = RoundToByte(y + (u - 128) * 1.881400);
+}
+
TEST_F(LibYUVColorTest, TestYUV) {
int r0, g0, b0, r1, g1, b1;
@@ -573,16 +606,12 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
#else
#define FASTSTEP 5
#endif
+
+// BT.601 limited range.
TEST_F(LibYUVColorTest, TestFullYUV) {
- int rh[256] = {
- 0,
- };
- int gh[256] = {
- 0,
- };
- int bh[256] = {
- 0,
- };
+ int rh[256] = { 0, };
+ int gh[256] = { 0, };
+ int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@@ -602,16 +631,11 @@ TEST_F(LibYUVColorTest, TestFullYUV) {
PrintHistogram(rh, gh, bh);
}
+// BT.601 full range.
TEST_F(LibYUVColorTest, TestFullYUVJ) {
- int rh[256] = {
- 0,
- };
- int gh[256] = {
- 0,
- };
- int bh[256] = {
- 0,
- };
+ int rh[256] = { 0, };
+ int gh[256] = { 0, };
+ int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@@ -631,16 +655,11 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) {
PrintHistogram(rh, gh, bh);
}
+// BT.709 limited range.
TEST_F(LibYUVColorTest, TestFullYUVH) {
- int rh[256] = {
- 0,
- };
- int gh[256] = {
- 0,
- };
- int bh[256] = {
- 0,
- };
+ int rh[256] = { 0, };
+ int gh[256] = { 0, };
+ int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@@ -661,16 +680,11 @@ TEST_F(LibYUVColorTest, TestFullYUVH) {
PrintHistogram(rh, gh, bh);
}
+// BT.709 full range.
TEST_F(LibYUVColorTest, TestFullYUVF) {
- int rh[256] = {
- 0,
- };
- int gh[256] = {
- 0,
- };
- int bh[256] = {
- 0,
- };
+ int rh[256] = { 0, };
+ int gh[256] = { 0, };
+ int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@@ -690,16 +704,11 @@ TEST_F(LibYUVColorTest, TestFullYUVF) {
PrintHistogram(rh, gh, bh);
}
+// BT.2020 limited range.
TEST_F(LibYUVColorTest, TestFullYUVU) {
- int rh[256] = {
- 0,
- };
- int gh[256] = {
- 0,
- };
- int bh[256] = {
- 0,
- };
+ int rh[256] = { 0, };
+ int gh[256] = { 0, };
+ int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@@ -719,6 +728,30 @@ TEST_F(LibYUVColorTest, TestFullYUVU) {
}
PrintHistogram(rh, gh, bh);
}
+
+// BT.2020 full range.
+TEST_F(LibYUVColorTest, TestFullYUVV) {
+ int rh[256] = { 0, };
+ int gh[256] = { 0, };
+ int bh[256] = { 0, };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVVToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVVToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, 2);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
#undef FASTSTEP
TEST_F(LibYUVColorTest, TestGreyYUVJ) {
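As a quick sanity check of the BT.2020 full range reference added above (illustrative only, not part of the test), Y=128, U=96, V=160 should come out to roughly (R, G, B) = (175, 115, 68). A standalone version, using a stand-in RoundToByte that rounds to nearest and clamps to [0, 255]:

#include <math.h>
#include <stdio.h>

static int RoundToByte(double v) {
  long i = lround(v);  // round to nearest
  if (i < 0) i = 0;    // clamp to byte range
  if (i > 255) i = 255;
  return (int)i;
}

int main(void) {
  const int y = 128, u = 96, v = 160;
  int r = RoundToByte(y + (v - 128) * 1.474600);
  int g = RoundToByte(y - (u - 128) * 0.164553 - (v - 128) * 0.571353);
  int b = RoundToByte(y + (u - 128) * 1.881400);
  printf("R=%d G=%d B=%d\n", r, g, b);  // prints R=175 G=115 B=68
  return 0;
}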
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index c180811a..20703200 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -558,7 +558,7 @@ TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
-// Provide matrix wrappers
+// Provide matrix wrappers for full range BT.709
#define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \
I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j)
#define F420ToARGB(a, b, c, d, e, f, g, h, i, j) \
@@ -572,6 +572,20 @@ TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
#define F444ToARGB(a, b, c, d, e, f, g, h, i, j) \
I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j)
+// Provide matrix wrappers for full range BT.2020
+#define V420ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
+#define V420ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+#define V422ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
+#define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+#define V444ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I444ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
+#define V444ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+
#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
@@ -643,6 +657,8 @@ TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(U420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(U420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(V420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(V420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
@@ -667,6 +683,8 @@ TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(U422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(U422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(V422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(V422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
@@ -677,6 +695,8 @@ TESTPLANARTOB(H444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(H444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(U444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(U444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(V444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(V444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
@@ -772,6 +792,12 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
l, m)
+#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
#define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
l, m)
@@ -796,6 +822,12 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
#define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
l, m)
+#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
#define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
l, m)
@@ -820,6 +852,12 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
#define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
l, m)
+#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1)
@@ -829,6 +867,8 @@ TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(V420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(V420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(I422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(J422Alpha, 2, 1, ARGB, 4, 4, 1)
@@ -837,6 +877,8 @@ TESTQPLANARTOB(H422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(H422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(U422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(U422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(V422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(V422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(I444Alpha, 1, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(J444Alpha, 1, 1, ARGB, 4, 4, 1)
@@ -845,6 +887,8 @@ TESTQPLANARTOB(H444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(H444Alpha, 1, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(U444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(U444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(V444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1)
#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \
BPP_B, W1280, N, NEG, OFF) \
@@ -2771,6 +2815,8 @@ TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(U422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(U422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(V422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(V422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ABGR, 4)
@@ -2781,6 +2827,8 @@ TESTPLANARTOE(H444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(U444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(U444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(V444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(V444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
@@ -2862,6 +2910,8 @@ TESTQPLANARTOE(F420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(F420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(U420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(U420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(V420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(V420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(I422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(J422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
@@ -2872,6 +2922,8 @@ TESTQPLANARTOE(H422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(H422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(U422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(U422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(V422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(V422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(I444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(J444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
@@ -2880,6 +2932,8 @@ TESTQPLANARTOE(H444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(H444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(U444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(U444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(V444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
#define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \
OFF, FMT_C, BPP_C) \
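The V-prefixed wrappers above simply forward to the existing *Matrix entry points with kYuvV2020Constants. A minimal application-side sketch (the helper name and strides are illustrative, not from this patch):

#include <stdint.h>

#include "libyuv/convert_argb.h"

// Hypothetical helper: convert one I420 frame to ARGB using the new
// full range BT.2020 constants, mirroring the V420ToARGB test macro.
int ConvertV420ToARGB(const uint8_t* y, int y_stride,
                      const uint8_t* u, int u_stride,
                      const uint8_t* v, int v_stride,
                      uint8_t* dst_argb, int dst_stride,
                      int width, int height) {
  return libyuv::I420ToARGBMatrix(y, y_stride, u, u_stride, v, v_stride,
                                  dst_argb, dst_stride,
                                  &libyuv::kYuvV2020Constants, width, height);
}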
diff --git a/util/color.cc b/util/color.cc
new file mode 100644
index 00000000..2333276b
--- /dev/null
+++ b/util/color.cc
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2021 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// This utility computes values needed to generate yuvconstants based on
+// white point values.
+// The yuv formulas are tuned for 8-bit YUV channels.
+
+// For those matrix coefficients (MC) that can be expressed in terms of kr and kb:
+// Full range
+// float M[3][3] {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
+// float B[3] {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
+// Limited range
+// float M[3][3] {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
+// float B[3] {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
+
+// mc  standard
+// 1 bt.709 KR = 0.2126; KB = 0.0722
+// 4 fcc KR = 0.30; KB = 0.11
+// 6 bt.601 KR = 0.299; KB = 0.114
+// 7 SMPTE 240M KR = 0.212; KB = 0.087
+// 10 bt2020 KR = 0.2627; KB = 0.0593
+
+// BT.709 full range YUV to RGB reference
+// R = Y + V * 1.5748
+// G = Y - U * 0.18732 - V * 0.46812
+// B = Y + U * 1.8556
+// KR = 0.2126
+// KB = 0.0722
+
+// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/
+
+// // Y contribution to R,G,B. Scale and bias.
+// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+// #define YB 32 /* 64 / 2 */
+//
+// // U and V contributions to R,G,B.
+// #define UB 113 /* round(1.77200 * 64) */
+// #define UG 22 /* round(0.34414 * 64) */
+// #define VG 46 /* round(0.71414 * 64) */
+// #define VR 90 /* round(1.40200 * 64) */
+//
+// // Bias values to round, and subtract 128 from U and V.
+// #define BB (-UB * 128 + YB)
+// #define BG (UG * 128 + VG * 128 + YB)
+// #define BR (-VR * 128 + YB)
+
+int round(float v) {
+ return (int) (v + 0.5);
+}
+
+int main(int argc, const char* argv[]) {
+
+ if (argc < 3) {
+ printf("color kr kb\n");
+ return -1;
+ }
+ float kr = atof(argv[1]);
+ float kb = atof(argv[2]);
+ float kg = 1 - kr - kb;
+
+ float vr = 2 * (1 - kr);
+ float ug = 2 * ((1 - kb) * kb / kg);
+ float vg = 2 * ((1 - kr) * kr / kg);
+ float ub = 2 * (1 - kb);
+
+ printf("Full range\n");
+ printf("R = Y + V * %5f\n", vr);
+ printf("G = Y - U * %6f - V * %6f\n", ug, vg);
+ printf("B = Y + U * %5f\n", ub);
+
+ printf("KR = %4f; ", kr);
+ printf("KB = %4f\n", kb);
+// printf("KG = %4f\n", kg);
+// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+// #define YB 32 /* 64 / 2 */
+//
+// // U and V contributions to R,G,B.
+
+ printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
+ printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
+ printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
+ printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
+
+ vr = 255.f / 224.f * 2 * (1 - kr);
+ ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
+ vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg);
+ ub = 255.f / 224.f * 2 * (1 - kb);
+
+ printf("Limited range\n");
+ printf("R = (Y - 16) * 1.164 + V * %5f\n", vr);
+ printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
+ printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
+
+// printf("KG = %4f\n", kg);
+// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+// #define YB 32 /* 64 / 2 */
+//
+// // U and V contributions to R,G,B.
+
+ printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
+ printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
+ printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
+ printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
+
+ return 0;
+}
+
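Usage sketch for the new utility (the build and invocation below are assumptions, not documented in the patch):

  g++ util/color.cc -o color
  ./color 0.2627 0.0593    # BT.2020 kr, kb

For the BT.2020 coefficients this should report a full range mapping of roughly R = Y + V * 1.4746, G = Y - U * 0.1646 - V * 0.5714, B = Y + U * 1.8814, which matches the BT.2020 full range reference added to unit_test/color_test.cc.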