-rw-r--r--  .clang-format | 6
-rw-r--r--  .gitignore | 37
-rw-r--r--  .gn (renamed from files/.gn) | 34
-rw-r--r--  .vpython | 52
-rw-r--r--  .vpython3 | 405
-rw-r--r--  AUTHORS (renamed from files/AUTHORS) | 2
-rw-r--r--  Android.bp | 179
-rw-r--r--  BUILD | 14
-rw-r--r--  BUILD.gn (renamed from files/BUILD.gn) | 163
-rw-r--r--  CM_linux_packages.cmake (renamed from files/CM_linux_packages.cmake) | 4
-rw-r--r--  CMakeLists.txt (renamed from files/CMakeLists.txt) | 27
-rw-r--r--  DEPS | 2576
-rw-r--r--  DIR_METADATA | 3
-rw-r--r--  LICENSE | 2
-rw-r--r--  METADATA | 16
-rw-r--r--  OWNERS | 15
-rw-r--r--  OWNERS.android | 1
-rw-r--r--  PATENTS (renamed from files/PATENTS) | 0
-rw-r--r-- [-rwxr-xr-x]  PRESUBMIT.py (renamed from files/PRESUBMIT.py) | 50
-rw-r--r--  README.chromium (renamed from files/README.chromium) | 6
-rw-r--r--  README.md (renamed from files/README.md) | 1
-rw-r--r--  README.version | 3
-rw-r--r--  build_overrides/build.gni (renamed from files/build_overrides/build.gni) | 23
-rw-r--r--  build_overrides/gtest.gni (renamed from files/build_overrides/gtest.gni) | 0
-rw-r--r--  build_overrides/partition_alloc.gni | 17
-rwxr-xr-x  cleanup_links.py (renamed from files/cleanup_links.py) | 27
-rw-r--r--  codereview.settings | 4
-rw-r--r--  docs/deprecated_builds.md (renamed from files/docs/deprecated_builds.md) | 5
-rw-r--r--  docs/environment_variables.md (renamed from files/docs/environment_variables.md) | 9
-rw-r--r--  docs/filtering.md (renamed from files/docs/filtering.md) | 0
-rw-r--r--  docs/formats.md (renamed from files/docs/formats.md) | 55
-rw-r--r--  docs/getting_started.md (renamed from files/docs/getting_started.md) | 54
-rw-r--r--  docs/rotation.md (renamed from files/docs/rotation.md) | 4
-rw-r--r--  download_vs_toolchain.py (renamed from files/download_vs_toolchain.py) | 6
-rw-r--r--  files/.gitignore | 94
-rw-r--r--  files/Android.bp | 179
-rw-r--r--  files/DEPS | 1096
-rw-r--r--  files/LICENSE | 29
-rw-r--r--  files/LICENSE_THIRD_PARTY | 8
-rw-r--r--  files/all.gyp | 21
-rw-r--r--  files/chromium/.gclient | 20
-rw-r--r--  files/chromium/README | 5
-rwxr-xr-x  files/gyp_libyuv | 101
-rw-r--r--  files/gyp_libyuv.py | 28
-rw-r--r--  files/include/libyuv/convert.h | 504
-rw-r--r--  files/include/libyuv/convert_argb.h | 721
-rw-r--r--  files/include/libyuv/rotate.h | 182
-rw-r--r--  files/include/libyuv/scale.h | 179
-rw-r--r--  files/infra/config/PRESUBMIT.py | 15
-rw-r--r--  files/infra/config/README.md | 1
-rw-r--r--  files/infra/config/cq.cfg | 51
-rw-r--r--  files/libyuv_nacl.gyp | 37
-rw-r--r--  files/libyuv_test.gyp | 203
-rw-r--r--  files/public.mk | 13
-rwxr-xr-x  files/setup_links.py | 497
-rw-r--r--  files/source/compare_gcc.cc | 360
-rw-r--r--  files/source/compare_mmi.cc | 123
-rw-r--r--  files/source/compare_neon.cc | 96
-rw-r--r--  files/source/compare_neon64.cc | 90
-rw-r--r--  files/source/convert.cc | 2576
-rw-r--r--  files/source/convert_argb.cc | 2371
-rw-r--r--  files/source/convert_from.cc | 1505
-rw-r--r--  files/source/rotate.cc | 605
-rw-r--r--  files/source/rotate_common.cc | 106
-rw-r--r--  files/source/rotate_dspr2.cc | 475
-rw-r--r--  files/source/rotate_gcc.cc | 374
-rw-r--r--  files/source/rotate_mmi.cc | 291
-rw-r--r--  files/source/row_any.cc | 1429
-rw-r--r--  files/source/row_dspr2.cc | 1721
-rw-r--r--  files/source/row_gcc.cc | 6798
-rw-r--r--  files/source/row_mmi.cc | 6042
-rw-r--r--  files/source/row_neon.cc | 2892
-rw-r--r--  files/source/row_neon64.cc | 3036
-rw-r--r--  files/source/scale_any.cc | 575
-rw-r--r--  files/source/scale_dspr2.cc | 668
-rw-r--r--  files/source/scale_gcc.cc | 1374
-rw-r--r--  files/source/scale_mmi.cc | 1113
-rw-r--r--  files/source/scale_neon.cc | 958
-rw-r--r--  files/source/scale_neon64.cc | 1052
-rwxr-xr-x  files/sync_chromium.py | 154
-rw-r--r--  files/third_party/gflags/BUILD.gn | 73
-rw-r--r--  files/third_party/gflags/LICENSE | 28
-rw-r--r--  files/third_party/gflags/README.libyuv | 28
-rw-r--r--  files/third_party/gflags/gen/posix/include/gflags/gflags.h | 573
-rw-r--r--  files/third_party/gflags/gen/posix/include/gflags/gflags_completions.h | 121
-rw-r--r--  files/third_party/gflags/gen/posix/include/gflags/gflags_declare.h | 141
-rw-r--r--  files/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h | 101
-rw-r--r--  files/third_party/gflags/gen/posix/include/private/config.h | 112
-rw-r--r--  files/third_party/gflags/gen/win/include/gflags/gflags.h | 573
-rw-r--r--  files/third_party/gflags/gen/win/include/gflags/gflags_completions.h | 121
-rw-r--r--  files/third_party/gflags/gen/win/include/gflags/gflags_declare.h | 141
-rw-r--r--  files/third_party/gflags/gen/win/include/gflags/gflags_gflags.h | 101
-rw-r--r--  files/third_party/gflags/gen/win/include/private/config.h | 112
-rw-r--r--  files/third_party/gflags/gflags.gyp | 92
-rw-r--r--  files/tools/OWNERS | 61
-rw-r--r--  files/tools/msan/OWNERS | 3
-rw-r--r--  files/tools/msan/blacklist.txt | 24
-rw-r--r--  files/tools/ubsan/OWNERS | 3
-rw-r--r--  files/tools/ubsan/blacklist.txt | 77
-rw-r--r--  files/tools/ubsan/vptr_blacklist.txt | 128
-rwxr-xr-x  files/tools_libyuv/autoroller/roll_deps.py | 507
-rw-r--r--  files/tools_libyuv/autoroller/unittests/.DS_Store | bin 6148 -> 0 bytes
-rwxr-xr-x  files/tools_libyuv/valgrind/chrome_tests.bat | 53
-rwxr-xr-x  files/tools_libyuv/valgrind/chrome_tests.py | 869
-rwxr-xr-x  files/tools_libyuv/valgrind/chrome_tests.sh | 94
-rw-r--r--  files/tools_libyuv/valgrind/common.py | 256
-rw-r--r--  files/tools_libyuv/valgrind/gdb_helper.py | 91
-rw-r--r--  files/tools_libyuv/valgrind/libyuv_tests.bat | 79
-rwxr-xr-x  files/tools_libyuv/valgrind/libyuv_tests.py | 139
-rwxr-xr-x  files/tools_libyuv/valgrind/libyuv_tests.sh | 101
-rwxr-xr-x  files/tools_libyuv/valgrind/locate_valgrind.sh | 73
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/OWNERS | 1
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py | 99
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/suppressions.txt | 21
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt | 5
-rw-r--r--  files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt | 5
-rwxr-xr-x  files/tools_libyuv/valgrind/memcheck_analyze.py | 644
-rwxr-xr-x  files/tools_libyuv/valgrind/valgrind.sh | 110
-rwxr-xr-x  files/tools_libyuv/valgrind/valgrind_test.py | 517
-rw-r--r--  files/unit_test/convert_test.cc | 3223
-rw-r--r--  files/unit_test/cpu_test.cc | 186
-rw-r--r--  files/unit_test/rotate_test.cc | 394
-rwxr-xr-x  files/util/android/test_runner.py | 37
-rw-r--r--  fuzz/Android.bp (renamed from files/fuzz/Android.bp) | 4
-rw-r--r--  fuzz/OWNERS (renamed from files/fuzz/OWNERS) | 0
-rw-r--r--  fuzz/mjpeg_dec_fuzz.cc (renamed from files/fuzz/mjpeg_dec_fuzz.cc) | 0
-rw-r--r--  include/libyuv.h (renamed from files/include/libyuv.h) | 1
-rw-r--r--  include/libyuv/basic_types.h (renamed from files/include/libyuv/basic_types.h) | 0
-rw-r--r--  include/libyuv/compare.h (renamed from files/include/libyuv/compare.h) | 0
-rw-r--r--  include/libyuv/compare_row.h (renamed from files/include/libyuv/compare_row.h) | 26
-rw-r--r--  include/libyuv/convert.h | 1045
-rw-r--r--  include/libyuv/convert_argb.h | 2315
-rw-r--r--  include/libyuv/convert_from.h (renamed from files/include/libyuv/convert_from.h) | 211
-rw-r--r--  include/libyuv/convert_from_argb.h (renamed from files/include/libyuv/convert_from_argb.h) | 117
-rw-r--r--  include/libyuv/cpu_id.h (renamed from files/include/libyuv/cpu_id.h) | 31
-rw-r--r--  include/libyuv/loongson_intrinsics.h | 1949
-rw-r--r--  include/libyuv/macros_msa.h (renamed from files/include/libyuv/macros_msa.h) | 45
-rw-r--r--  include/libyuv/mjpeg_decoder.h (renamed from files/include/libyuv/mjpeg_decoder.h) | 0
-rw-r--r--  include/libyuv/planar_functions.h (renamed from files/include/libyuv/planar_functions.h) | 404
-rw-r--r--  include/libyuv/rotate.h | 296
-rw-r--r--  include/libyuv/rotate_argb.h (renamed from files/include/libyuv/rotate_argb.h) | 0
-rw-r--r--  include/libyuv/rotate_row.h (renamed from files/include/libyuv/rotate_row.h) | 109
-rw-r--r--  include/libyuv/row.h (renamed from files/include/libyuv/row.h) | 3299
-rw-r--r--  include/libyuv/scale.h | 321
-rw-r--r--  include/libyuv/scale_argb.h (renamed from files/include/libyuv/scale_argb.h) | 0
-rw-r--r--  include/libyuv/scale_rgb.h | 42
-rw-r--r--  include/libyuv/scale_row.h (renamed from files/include/libyuv/scale_row.h) | 963
-rw-r--r--  include/libyuv/scale_uv.h | 51
-rw-r--r--  include/libyuv/version.h (renamed from files/include/libyuv/version.h) | 2
-rw-r--r--  include/libyuv/video_common.h (renamed from files/include/libyuv/video_common.h) | 53
-rw-r--r--  infra/config/OWNERS | 3
-rw-r--r--  infra/config/PRESUBMIT.py | 13
-rw-r--r--  infra/config/README.md | 2
-rw-r--r--  infra/config/codereview.settings (renamed from files/codereview.settings) | 6
-rw-r--r--  infra/config/commit-queue.cfg | 143
-rw-r--r--  infra/config/cr-buildbucket.cfg | 1704
-rw-r--r--  infra/config/luci-logdog.cfg | 9
-rw-r--r--  infra/config/luci-milo.cfg | 246
-rw-r--r--  infra/config/luci-scheduler.cfg | 385
-rwxr-xr-x  infra/config/main.star | 344
-rw-r--r--  infra/config/project.cfg | 15
-rw-r--r--  infra/config/realms.cfg | 83
-rw-r--r--  libyuv.gni (renamed from files/libyuv.gni) | 8
-rw-r--r--  libyuv.gyp (renamed from files/libyuv.gyp) | 0
-rw-r--r--  libyuv.gypi (renamed from files/libyuv.gypi) | 8
-rw-r--r--  linux.mk (renamed from files/linux.mk) | 40
-rw-r--r--  public.mk | 2
-rw-r--r--  pylintrc (renamed from files/pylintrc) | 0
-rwxr-xr-x  riscv_script/prepare_toolchain_qemu.sh | 74
-rw-r--r--  riscv_script/riscv-clang.cmake | 55
-rwxr-xr-x  riscv_script/run_qemu.sh | 15
-rw-r--r--  source/compare.cc (renamed from files/source/compare.cc) | 20
-rw-r--r--  source/compare_common.cc (renamed from files/source/compare_common.cc) | 30
-rw-r--r--  source/compare_gcc.cc | 359
-rw-r--r--  source/compare_msa.cc (renamed from files/source/compare_msa.cc) | 0
-rw-r--r--  source/compare_neon.cc | 96
-rw-r--r--  source/compare_neon64.cc | 94
-rw-r--r--  source/compare_win.cc (renamed from files/source/compare_win.cc) | 14
-rw-r--r--  source/convert.cc | 4055
-rw-r--r--  source/convert_argb.cc | 8556
-rw-r--r--  source/convert_from.cc | 910
-rw-r--r--  source/convert_from_argb.cc (renamed from files/source/convert_from_argb.cc) | 1950
-rw-r--r--  source/convert_jpeg.cc (renamed from files/source/convert_jpeg.cc) | 134
-rw-r--r--  source/convert_to_argb.cc (renamed from files/source/convert_to_argb.cc) | 89
-rw-r--r--  source/convert_to_i420.cc (renamed from files/source/convert_to_i420.cc) | 29
-rw-r--r--  source/cpu_id.cc (renamed from files/source/cpu_id.cc) | 181
-rw-r--r--  source/mjpeg_decoder.cc (renamed from files/source/mjpeg_decoder.cc) | 7
-rw-r--r--  source/mjpeg_validate.cc (renamed from files/source/mjpeg_validate.cc) | 0
-rw-r--r--  source/planar_functions.cc (renamed from files/source/planar_functions.cc) | 2934
-rw-r--r--  source/rotate.cc | 1231
-rw-r--r--  source/rotate_any.cc (renamed from files/source/rotate_any.cc) | 12
-rw-r--r--  source/rotate_argb.cc (renamed from files/source/rotate_argb.cc) | 122
-rw-r--r--  source/rotate_common.cc | 198
-rw-r--r--  source/rotate_gcc.cc | 503
-rw-r--r--  source/rotate_lsx.cc | 243
-rw-r--r--  source/rotate_msa.cc (renamed from files/source/rotate_msa.cc) | 0
-rw-r--r--  source/rotate_neon.cc (renamed from files/source/rotate_neon.cc) | 258
-rw-r--r--  source/rotate_neon64.cc (renamed from files/source/rotate_neon64.cc) | 352
-rw-r--r--  source/rotate_win.cc (renamed from files/source/rotate_win.cc) | 5
-rw-r--r--  source/row_any.cc | 2459
-rw-r--r--  source/row_common.cc (renamed from files/source/row_common.cc) | 2832
-rw-r--r--  source/row_gcc.cc | 9744
-rw-r--r--  source/row_lasx.cc | 2304
-rw-r--r--  source/row_lsx.cc | 2987
-rw-r--r--  source/row_msa.cc (renamed from files/source/row_msa.cc) | 1507
-rw-r--r--  source/row_neon.cc | 3999
-rw-r--r--  source/row_neon64.cc | 4630
-rw-r--r--  source/row_rvv.cc | 1394
-rw-r--r--  source/row_win.cc (renamed from files/source/row_win.cc) | 750
-rw-r--r--  source/scale.cc (renamed from files/source/scale.cc) | 1358
-rw-r--r--  source/scale_any.cc | 1078
-rw-r--r--  source/scale_argb.cc (renamed from files/source/scale_argb.cc) | 488
-rw-r--r--  source/scale_common.cc (renamed from files/source/scale_common.cc) | 753
-rw-r--r--  source/scale_gcc.cc | 2953
-rw-r--r--  source/scale_lsx.cc | 739
-rw-r--r--  source/scale_msa.cc (renamed from files/source/scale_msa.cc) | 0
-rw-r--r--  source/scale_neon.cc | 1533
-rw-r--r--  source/scale_neon64.cc | 1578
-rw-r--r--  source/scale_rgb.cc | 66
-rw-r--r--  source/scale_rvv.cc | 1040
-rw-r--r--  source/scale_uv.cc | 1210
-rw-r--r--  source/scale_win.cc (renamed from files/source/scale_win.cc) | 5
-rwxr-xr-x  source/test.sh | 35
-rw-r--r--  source/video_common.cc (renamed from files/source/video_common.cc) | 0
-rw-r--r--  tools_libyuv/OWNERS | 4
-rwxr-xr-x  tools_libyuv/autoroller/roll_deps.py | 822
-rwxr-xr-x  tools_libyuv/autoroller/unittests/roll_deps_test.py (renamed from files/tools_libyuv/autoroller/unittests/roll_deps_test.py) | 48
-rw-r--r--  tools_libyuv/autoroller/unittests/testdata/DEPS (renamed from files/tools_libyuv/autoroller/unittests/testdata/DEPS) | 1
-rw-r--r--  tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new (renamed from files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new) | 0
-rw-r--r--  tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old (renamed from files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old) | 0
-rwxr-xr-x  tools_libyuv/get_landmines.py (renamed from files/tools_libyuv/get_landmines.py) | 7
-rw-r--r--  tools_libyuv/msan/OWNERS | 3
-rw-r--r--  tools_libyuv/msan/blacklist.txt (renamed from files/tools_libyuv/msan/blacklist.txt) | 0
-rw-r--r--  tools_libyuv/ubsan/OWNERS | 3
-rw-r--r--  tools_libyuv/ubsan/blacklist.txt (renamed from files/tools_libyuv/ubsan/blacklist.txt) | 0
-rw-r--r--  tools_libyuv/ubsan/vptr_blacklist.txt (renamed from files/tools_libyuv/ubsan/vptr_blacklist.txt) | 0
-rw-r--r--  unit_test/basictypes_test.cc (renamed from files/unit_test/basictypes_test.cc) | 0
-rw-r--r--  unit_test/color_test.cc (renamed from files/unit_test/color_test.cc) | 305
-rw-r--r--  unit_test/compare_test.cc (renamed from files/unit_test/compare_test.cc) | 9
-rw-r--r--  unit_test/convert_argb_test.cc | 2700
-rw-r--r--  unit_test/convert_test.cc | 2110
-rw-r--r--  unit_test/cpu_test.cc | 342
-rw-r--r--  unit_test/cpu_thread_test.cc (renamed from files/unit_test/cpu_thread_test.cc) | 4
-rw-r--r--  unit_test/math_test.cc (renamed from files/unit_test/math_test.cc) | 5
-rw-r--r--  unit_test/planar_test.cc (renamed from files/unit_test/planar_test.cc) | 1769
-rw-r--r--  unit_test/rotate_argb_test.cc (renamed from files/unit_test/rotate_argb_test.cc) | 164
-rw-r--r--  unit_test/rotate_test.cc | 962
-rw-r--r--  unit_test/scale_argb_test.cc (renamed from files/unit_test/scale_argb_test.cc) | 167
-rw-r--r--  unit_test/scale_plane_test.cc | 470
-rw-r--r--  unit_test/scale_rgb_test.cc | 280
-rw-r--r--  unit_test/scale_test.cc (renamed from files/unit_test/scale_test.cc) | 873
-rw-r--r--  unit_test/scale_uv_test.cc | 249
-rw-r--r--  unit_test/testdata/arm_v7.txt (renamed from files/unit_test/testdata/arm_v7.txt) | 0
-rw-r--r--  unit_test/testdata/juno.txt (renamed from files/unit_test/testdata/juno.txt) | 0
-rw-r--r--  unit_test/testdata/mips.txt | 7
-rw-r--r--  unit_test/testdata/mips_loongson2k.txt | 5
-rw-r--r--  unit_test/testdata/mips_loongson3.txt | 10
-rw-r--r--  unit_test/testdata/mips_loongson_mmi.txt | 7
-rw-r--r--  unit_test/testdata/mips_msa.txt | 7
-rw-r--r--  unit_test/testdata/riscv64.txt | 4
-rw-r--r--  unit_test/testdata/riscv64_rvv.txt | 4
-rw-r--r--  unit_test/testdata/riscv64_rvv_zvfh.txt | 4
-rw-r--r--  unit_test/testdata/tegra3.txt (renamed from files/unit_test/testdata/tegra3.txt) | 0
-rw-r--r--  unit_test/testdata/test0.jpg (renamed from files/unit_test/testdata/test0.jpg) | bin 421 -> 421 bytes
-rw-r--r--  unit_test/testdata/test1.jpg (renamed from files/unit_test/testdata/test1.jpg) | bin 735 -> 735 bytes
-rw-r--r--  unit_test/testdata/test2.jpg (renamed from files/unit_test/testdata/test2.jpg) | bin 685 -> 685 bytes
-rw-r--r--  unit_test/testdata/test3.jpg (renamed from files/unit_test/testdata/test3.jpg) | bin 704 -> 704 bytes
-rw-r--r--  unit_test/testdata/test4.jpg (renamed from files/unit_test/testdata/test4.jpg) | bin 701 -> 701 bytes
-rw-r--r--  unit_test/unit_test.cc (renamed from files/unit_test/unit_test.cc) | 210
-rw-r--r--  unit_test/unit_test.h (renamed from files/unit_test/unit_test.h) | 22
-rw-r--r--  unit_test/video_common_test.cc (renamed from files/unit_test/video_common_test.cc) | 11
-rw-r--r--  util/Makefile (renamed from files/util/Makefile) | 0
-rw-r--r--  util/color.cc | 120
-rw-r--r--  util/compare.cc (renamed from files/util/compare.cc) | 0
-rw-r--r--  util/cpuid.c (renamed from files/util/cpuid.c) | 81
-rw-r--r--  util/i444tonv12_eg.cc | 28
-rw-r--r--  util/psnr.cc (renamed from files/util/psnr.cc) | 0
-rw-r--r--  util/psnr.h (renamed from files/util/psnr.h) | 0
-rw-r--r--  util/psnr_main.cc (renamed from files/util/psnr_main.cc) | 4
-rw-r--r--  util/ssim.cc (renamed from files/util/ssim.cc) | 0
-rw-r--r--  util/ssim.h (renamed from files/util/ssim.h) | 0
-rw-r--r--  util/yuvconstants.c | 106
-rw-r--r--  util/yuvconvert.cc (renamed from files/util/yuvconvert.cc) | 30
-rw-r--r--  winarm.mk (renamed from files/winarm.mk) | 1
284 files changed, 92486 insertions, 56420 deletions
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 00000000..59d48705
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,6 @@
+# Defines the Chromium style for automatic reformatting.
+# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+BasedOnStyle: Chromium
+---
+Language: Java
+BasedOnStyle: Google
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..20d679b7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,37 @@
+*.pyc
+.landmines
+pin-log.txt
+/base
+/build
+/buildtools
+/google_apis
+/links
+/links.db
+/ios
+/mojo
+/native_client
+/net
+/out
+/unit_test/out
+/source/out
+/sde-avx-sse-transition-out.txt
+/testing
+/third_party
+/tools
+
+# Files generated by CMake build
+cmake_install.cmake
+CMakeCache.txt
+CMakeFiles/
+yuvconvert
+libgtest.a
+libyuv.a
+libyuv_unittest
+
+# Files generated by winarm.mk build
+libyuv_arm.lib
+source/*.o
+
+# Files generated by perf
+perf.data
+perf.data.old
diff --git a/files/.gn b/.gn
index 63dad32d..f9a5ee6c 100644
--- a/files/.gn
+++ b/.gn
@@ -6,9 +6,15 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
+import("//build/dotfile_settings.gni")
+
# The location of the build configuration file.
buildconfig = "//build/config/BUILDCONFIG.gn"
+# The python interpreter to use by default. On Windows, this will look
+# for python3.exe and python3.bat.
+script_executable = "python3"
+
# The secondary source root is a parallel directory tree where
# GN build files are placed when they can not be placed directly
# in the source tree, e.g. for third party source trees.
@@ -23,24 +29,10 @@ check_targets = [ "//libyuv/*" ]
# These are the list of GN files that run exec_script. This whitelist exists
# to force additional review for new uses of exec_script, which is strongly
# discouraged except for gypi_to_gn calls.
-exec_script_whitelist = [
- "//build/config/BUILD.gn",
- "//build/config/android/BUILD.gn",
- "//build/config/android/config.gni",
- "//build/config/android/internal_rules.gni",
- "//build/config/android/rules.gni",
- "//build/config/compiler/BUILD.gn",
- "//build/config/gcc/gcc_version.gni",
- "//build/config/ios/ios_sdk.gni",
- "//build/config/linux/BUILD.gn",
- "//build/config/linux/pkg_config.gni",
- "//build/config/mac/mac_sdk.gni",
- "//build/config/posix/BUILD.gn",
- "//build/config/sysroot.gni",
- "//build/config/win/visual_studio_version.gni",
- "//build/gn_helpers.py",
- "//build/gypi_to_gn.py",
- "//build/toolchain/gcc_toolchain.gni",
- "//build/toolchain/mac/BUILD.gn",
- "//build/toolchain/win/BUILD.gn",
-]
+exec_script_whitelist = build_dotfile_settings.exec_script_whitelist +
+ [ "//build_overrides/build.gni" ]
+
+default_args = {
+ mac_sdk_min = "10.12"
+ ios_deployment_target = "12.0"
+}
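
The `script_executable` setting above names a bare `python3` rather than a full path, and the comment describes a PATH-style lookup. A minimal sketch of that resolution in Python, assuming plain PATH search is a fair approximation (GN's actual lookup is implemented in its own source; this is only illustrative):

```python
# Illustrative only: approximate how a bare "python3" script_executable
# might be resolved. On Windows, shutil.which() also consults PATHEXT,
# which is how both python3.exe and python3.bat become candidates.
import shutil
from typing import Optional

def resolve_script_executable(name: str = "python3") -> Optional[str]:
    # Returns the first matching executable on PATH, or None.
    return shutil.which(name)

if __name__ == "__main__":
    print(resolve_script_executable())
```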
diff --git a/.vpython b/.vpython
new file mode 100644
index 00000000..4a64fd21
--- /dev/null
+++ b/.vpython
@@ -0,0 +1,52 @@
+# This is a vpython "spec" file.
+#
+# It describes patterns for python wheel dependencies of the python scripts in
+# the chromium repo, particularly for dependencies that have compiled components
+# (since pure-python dependencies can be easily vendored into third_party).
+#
+# When vpython is invoked, it finds this file and builds a python VirtualEnv,
+# containing all of the dependencies described in this file, fetching them from
+# CIPD (the "Chrome Infrastructure Package Deployer" service). Unlike `pip`,
+# this never requires the end-user machine to have a working python extension
+# compilation environment. All of these packages are built using:
+# https://chromium.googlesource.com/infra/infra/+/master/infra/tools/dockerbuild/
+#
+# All python scripts in the repo share this same spec, to avoid dependency
+# fragmentation.
+#
+# If you have depot_tools installed in your $PATH, you can invoke python scripts
+# in this repo by running them as you normally would run them, except
+# substituting `vpython` instead of `python` on the command line, e.g.:
+# vpython path/to/script.py some --arguments
+#
+# Read more about `vpython` and how to modify this file here:
+# https://chromium.googlesource.com/infra/infra/+/master/doc/users/vpython.md
+
+python_version: "2.7"
+
+# Used by:
+# third_party/catapult
+wheel: <
+ name: "infra/python/wheels/psutil/${platform}_${py_python}_${py_abi}"
+ version: "version:5.2.2"
+>
+
+# Used by:
+# third_party/catapult
+wheel: <
+ name: "infra/python/wheels/pypiwin32/${vpython_platform}"
+ version: "version:219"
+ match_tag: <
+ platform: "win32"
+ >
+ match_tag: <
+ platform: "win_amd64"
+ >
+>
+
+# Used by:
+# build/android
+wheel: <
+ name: "infra/python/wheels/requests-py2_py3"
+ version: "version:2.13.0"
+>
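
The wheel names in this spec are CIPD package templates; placeholders such as `${platform}`, `${py_python}`, and `${py_abi}` are filled in per host by vpython. A minimal sketch of that substitution with hypothetical host values (the real values are derived from the running interpreter and OS):

```python
from string import Template

# Hypothetical host properties; vpython computes the real ones itself.
props = {
    "platform": "linux-amd64",
    "py_python": "cp27",
    "py_abi": "cp27mu",
    "vpython_platform": "linux-amd64_cp27_cp27mu",
}

# A wheel name taken from the spec above; string.Template uses the same
# ${...} placeholder syntax as the spec.
name = "infra/python/wheels/psutil/${platform}_${py_python}_${py_abi}"
print(Template(name).substitute(props))
# -> infra/python/wheels/psutil/linux-amd64_cp27_cp27mu
```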
diff --git a/.vpython3 b/.vpython3
new file mode 100644
index 00000000..28d819e7
--- /dev/null
+++ b/.vpython3
@@ -0,0 +1,405 @@
+# This is a vpython "spec" file.
+#
+# It describes patterns for python wheel dependencies of the python scripts in
+# the chromium repo, particularly for dependencies that have compiled components
+# (since pure-python dependencies can be easily vendored into third_party).
+#
+# When vpython is invoked, it finds this file and builds a python VirtualEnv,
+# containing all of the dependencies described in this file, fetching them from
+# CIPD (the "Chrome Infrastructure Package Deployer" service). Unlike `pip`,
+# this never requires the end-user machine to have a working python extension
+# compilation environment. All of these packages are built using:
+# https://chromium.googlesource.com/infra/infra/+/main/infra/tools/dockerbuild/
+#
+# All python scripts in the repo share this same spec, to avoid dependency
+# fragmentation.
+#
+# If you have depot_tools installed in your $PATH, you can invoke python scripts
+# in this repo by running them as you normally would run them, except
+# substituting `vpython` instead of `python` on the command line, e.g.:
+# vpython path/to/script.py some --arguments
+#
+# Read more about `vpython` and how to modify this file here:
+# https://chromium.googlesource.com/infra/infra/+/main/doc/users/vpython.md
+
+python_version: "3.8"
+
+# The default set of platforms vpython checks does not yet include mac-arm64.
+# Setting `verify_pep425_tag` to the list of platforms we explicitly must support
+# allows us to ensure that vpython specs stay mac-arm64-friendly
+verify_pep425_tag: [
+ {python: "cp38", abi: "cp38", platform: "manylinux1_x86_64"},
+ {python: "cp38", abi: "cp38", platform: "linux_arm64"},
+
+ {python: "cp38", abi: "cp38", platform: "macosx_10_10_intel"},
+ {python: "cp38", abi: "cp38", platform: "macosx_11_0_arm64"},
+
+ {python: "cp38", abi: "cp38", platform: "win32"},
+ {python: "cp38", abi: "cp38", platform: "win_amd64"}
+]
+
+# Used by:
+# build/android/pylib/local/emulator/avd.py
+# components/policy/test_support/policy_testserver.py
+wheel: <
+ name: "infra/python/wheels/protobuf-py2_py3"
+ version: "version:3.15.8"
+>
+
+# TODO(https://crbug.com/898348): Add in necessary wheels as Python3 versions
+# become available.
+wheel: <
+ name: "infra/python/wheels/six-py2_py3"
+ version: "version:1.15.0"
+>
+
+# Common utilities.
+# Use the same versions specified by //third_party/catapult/.vpython3 so that
+# Chromium tests using Telemetry function properly.
+wheel: <
+ name: "infra/python/wheels/numpy/${vpython_platform}"
+ version: "version:1.20.3"
+ # A newer version of numpy is required on ARM64, but it breaks older OS versions.
+ not_match_tag <
+ platform: "macosx_11_0_arm64"
+ >
+>
+wheel: <
+ name: "infra/python/wheels/numpy/mac-arm64_cp38_cp38"
+ version: "version:1.21.1"
+ match_tag <
+ platform: "macosx_11_0_arm64"
+ >
+>
+wheel: <
+ name: "infra/python/wheels/psutil/${vpython_platform}"
+ version: "version:5.8.0.chromium.2"
+>
+wheel: <
+ name: "infra/python/wheels/requests-py3"
+ version: "version:2.31.0"
+>
+
+# Used by various python unit tests.
+wheel: <
+ name: "infra/python/wheels/mock-py2_py3"
+ version: "version:2.0.0"
+>
+wheel: <
+ name: "infra/python/wheels/parameterized-py2_py3"
+ version: "version:0.7.1"
+>
+wheel: <
+ name: "infra/python/wheels/pbr-py2_py3"
+ version: "version:3.0.0"
+>
+
+wheel: <
+ name: "infra/python/wheels/pyfakefs-py2_py3"
+ version: "version:3.7.2"
+>
+
+# Used by:
+# build/chromeos/test_runner.py
+wheel: <
+ name: "infra/python/wheels/jsonlines-py2_py3"
+ version: "version:1.2.0"
+>
+wheel: <
+ name: "infra/python/wheels/python-dateutil-py2_py3"
+ version: "version:2.7.3"
+>
+
+# Used by WPT importer
+wheel: <
+ name: "infra/python/wheels/charset_normalizer-py3"
+ version: "version:2.0.4"
+>
+wheel: <
+ name: "infra/python/wheels/pyasn1-py2_py3"
+ version: "version:0.4.5"
+>
+wheel: <
+ name: "infra/python/wheels/pyasn1_modules-py2_py3"
+ version: "version:0.2.4"
+>
+wheel: <
+ name: "infra/python/wheels/rsa-py2_py3"
+ version: "version:3.4.2"
+>
+wheel: <
+ name: "infra/python/wheels/cachetools-py2_py3"
+ version: "version:2.0.1"
+>
+wheel: <
+ name: "infra/python/wheels/uritemplate-py2_py3"
+ version: "version:3.0.0"
+>
+wheel: <
+ name: "infra/python/wheels/google-auth-py2_py3"
+ version: "version:1.25.0"
+>
+wheel: <
+ name: "infra/python/wheels/googleapis-common-protos-py2_py3"
+ version: "version:1.52.0"
+>
+wheel: <
+ name: "infra/python/wheels/google-api-core-py2_py3"
+ version: "version:1.25.1"
+>
+wheel: <
+ name: "infra/python/wheels/google-auth-httplib2-py2_py3"
+ version: "version:0.1.0"
+>
+wheel: <
+ name: "infra/python/wheels/google-api-python-client-py3"
+ version: "version:2.2.0"
+>
+wheel: <
+ name: "infra/python/wheels/oauth2client-py2_py3"
+ version: "version:3.0.0"
+>
+
+# Used by Web Platform Tests (WPT) codebase in
+# //third_party/blink/web_tests/external/wpt/tools/
+wheel: <
+ name: "infra/python/wheels/html5lib-py2_py3"
+ version: "version:1.0.1"
+>
+wheel: <
+ name: "infra/python/wheels/mozdebug-py2_py3"
+ version: "version:0.2"
+>
+wheel: <
+ name: "infra/python/wheels/mozinfo-py2_py3"
+ version: "version:1.2.2"
+>
+wheel: <
+ name: "infra/python/wheels/mozlog-py2_py3"
+ version: "version:7.1.0"
+>
+wheel: <
+ name: "infra/python/wheels/mozprocess-py2_py3"
+ version: "version:1.2.1"
+>
+wheel: <
+ name: "infra/python/wheels/urllib3-py2_py3"
+ version: "version:1.24.3"
+>
+wheel: <
+ name: "infra/python/wheels/blessings-py2_py3"
+ version: "version:1.7"
+>
+wheel: <
+ name: "infra/python/wheels/mozfile-py2_py3"
+ version: "version:2.0.0"
+>
+wheel: <
+ name: "infra/python/wheels/mozterm-py2_py3"
+ version: "version:1.0.0"
+>
+wheel: <
+ name: "infra/python/wheels/webencodings-py2_py3"
+ version: "version:0.5.1"
+>
+wheel: <
+ name: "infra/python/wheels/certifi-py2_py3"
+ version: "version:2020.11.8"
+>
+wheel: <
+ name: "infra/python/wheels/chardet-py2_py3"
+ version: "version:3.0.4"
+>
+wheel: <
+ name: "infra/python/wheels/idna-py2_py3"
+ version: "version:2.8"
+>
+wheel: <
+ name: "infra/python/wheels/distro-py2_py3"
+ version: "version:1.4.0"
+>
+wheel: <
+ name: "infra/python/wheels/pillow/linux-amd64_cp38_cp38"
+ version: "version:8.1.2"
+>
+wheel: <
+ name: "infra/python/wheels/aioquic/${vpython_platform}"
+ version: "version:0.9.15"
+>
+wheel: <
+ name: "infra/python/wheels/pylsqpack/${vpython_platform}"
+ version: "version:0.3.12"
+>
+wheel: <
+ name: "infra/python/wheels/cryptography/${vpython_platform}"
+ version: "version:3.3.1.chromium.1"
+>
+wheel: <
+ name: "infra/python/wheels/cffi/${vpython_platform}"
+ version: "version:1.14.5"
+>
+wheel: <
+ name: "infra/python/wheels/pycparser-py2_py3"
+ version: "version:2.19"
+>
+
+# Used by:
+# chrome/test/chromedriver/test/run_webdriver_tests.py
+wheel: <
+ name: "infra/python/wheels/iniconfig-py3"
+ version: "version:1.1.1"
+>
+
+wheel: <
+ name: "infra/python/wheels/packaging-py2_py3"
+ version: "version:16.8"
+>
+
+wheel: <
+ name: "infra/python/wheels/pyparsing-py2_py3"
+ version: "version:2.4.7"
+>
+
+wheel: <
+ name: "infra/python/wheels/toml-py3"
+ version: "version:0.10.1"
+>
+
+wheel <
+ name: "infra/python/wheels/pytest-py3"
+ version: "version:6.2.2"
+>
+
+wheel <
+ name: "infra/python/wheels/pytest-asyncio-py3"
+ version: "version:0.14.0"
+>
+
+wheel <
+ name: "infra/python/wheels/attrs-py2_py3"
+ version: "version:20.3.0"
+>
+
+wheel <
+ name: "infra/python/wheels/six-py2_py3"
+ version: "version:1.15.0"
+>
+
+wheel <
+ name: "infra/python/wheels/more-itertools-py2_py3"
+ version: "version:4.1.0"
+>
+
+wheel <
+ name: "infra/python/wheels/pluggy-py3"
+ version: "version:0.13.1"
+>
+
+wheel <
+ name: "infra/python/wheels/py-py2_py3"
+ version: "version:1.10.0"
+>
+
+wheel <
+ name: "infra/python/wheels/funcsigs-py2_py3"
+ version: "version:1.0.2"
+>
+
+wheel: <
+ name: "infra/python/wheels/atomicwrites-py2_py3"
+ version: "version:1.3.0"
+>
+
+wheel: <
+ name: "infra/python/wheels/colorama-py2_py3"
+ version: "version:0.4.1"
+>
+
+# Used by:
+# testing/buildbot/generate_buildbot_json_coveragetest.py
+wheel: <
+ name: "infra/python/wheels/coverage/${vpython_platform}"
+ version: "version:5.5.chromium.2"
+>
+
+# Used by:
+# //content/test/gpu
+wheel: <
+ name: "infra/python/wheels/pathos/${vpython_platform}"
+ version: "version:0.2.7.chromium.4"
+ not_match_tag <
+ abi: "cp27mu"
+ platform: "manylinux1_i686"
+ >
+ not_match_tag <
+ abi: "cp27mu"
+ platform: "linux_mips64"
+ >
+ not_match_tag <
+ abi: "cp27mu"
+ platform: "linux_armv6l"
+ >
+ not_match_tag <
+ abi: "cp27mu"
+ platform: "linux_armv7l"
+ >
+>
+
+# Used by:
+# //tools/infra/find_bad_builds.py
+wheel: <
+ name: "infra/python/wheels/pytz-py2_py3"
+ version: "version:2018.4"
+>
+
+# Used by:
+# //third_party/blink/tools/blinkpy/web_tests/port/server_process.py
+wheel: <
+ name: "infra/python/wheels/pywin32/${vpython_platform}"
+ version: "version:300"
+ match_tag: <
+ platform: "win32"
+ >
+ match_tag: <
+ platform: "win_amd64"
+ >
+>
+
+# Used by:
+# //content/test/gpu/gpu_tests/color_profile_manager_mac.py
+wheel: <
+ name: "infra/python/wheels/pyobjc/${vpython_platform}"
+ version: "version:7.3.chromium.1"
+ match_tag: <
+ platform: "macosx_10_10_intel"
+ >
+>
+
+# Used by:
+# tools/perf/core/results_dashboard.py
+wheel: <
+ name: "infra/python/wheels/httplib2-py3"
+ version: "version:0.19.1"
+>
+
+# Used by:
+# tools/perf/flakiness_cli
+wheel: <
+ name: "infra/python/wheels/pandas/${vpython_platform}"
+ version: "version:1.3.2.chromium.1"
+ match_tag: <
+ platform: "win32"
+ >
+ match_tag: <
+ platform: "win_amd64"
+ >
+ match_tag: <
+ platform: "manylinux1_i686"
+ >
+ match_tag: <
+ platform: "manylinux1_x86_64"
+ >
+ match_tag: <
+ platform: "macosx_10_6_intel"
+ >
+>
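
`verify_pep425_tag` pins the (python, abi, platform) triples that every wheel in the spec must cover. A minimal sketch of that coverage check over plain tuples, using the triples listed above (vpython's real verification resolves wheels against CIPD, which this does not attempt):

```python
# The triples from verify_pep425_tag above, as plain tuples.
REQUIRED_TAGS = {
    ("cp38", "cp38", "manylinux1_x86_64"),
    ("cp38", "cp38", "linux_arm64"),
    ("cp38", "cp38", "macosx_10_10_intel"),
    ("cp38", "cp38", "macosx_11_0_arm64"),
    ("cp38", "cp38", "win32"),
    ("cp38", "cp38", "win_amd64"),
}

def missing_tags(available):
    """Return the required tags with no matching wheel tag."""
    return REQUIRED_TAGS - set(available)

# Hypothetical wheel availability that forgot the mac-arm64 build:
print(sorted(missing_tags([
    ("cp38", "cp38", "manylinux1_x86_64"),
    ("cp38", "cp38", "win_amd64"),
])))
```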
diff --git a/files/AUTHORS b/AUTHORS
index 9686ac13..28c08956 100644
--- a/files/AUTHORS
+++ b/AUTHORS
@@ -2,3 +2,5 @@
# Name or Organization <email address>
Google Inc.
+
+Ivan Pavlotskiy <ivan.pavlotskiy@lgepartner.com>
diff --git a/Android.bp b/Android.bp
index e4ed511c..506184e0 100644
--- a/Android.bp
+++ b/Android.bp
@@ -1,7 +1,6 @@
package {
default_applicable_licenses: ["external_libyuv_license"],
}
-
// Added automatically by a large-scale-change
// See: http://go/android-license-faq
license {
@@ -12,7 +11,183 @@ license {
],
license_text: [
"LICENSE",
+ "PATENTS",
],
}
-
subdirs = ["files"]
+
+cc_library {
+ name: "libyuv",
+ vendor_available: true,
+ product_available: true,
+ host_supported: true,
+
+ srcs: [
+ "source/compare.cc",
+ "source/compare_common.cc",
+ "source/compare_gcc.cc",
+ "source/compare_msa.cc",
+ "source/compare_neon.cc",
+ "source/compare_neon64.cc",
+ "source/convert.cc",
+ "source/convert_argb.cc",
+ "source/convert_from.cc",
+ "source/convert_from_argb.cc",
+ "source/convert_jpeg.cc",
+ "source/convert_to_argb.cc",
+ "source/convert_to_i420.cc",
+ "source/cpu_id.cc",
+ "source/mjpeg_decoder.cc",
+ "source/mjpeg_validate.cc",
+ "source/planar_functions.cc",
+ "source/rotate.cc",
+ "source/rotate_any.cc",
+ "source/rotate_argb.cc",
+ "source/rotate_common.cc",
+ "source/rotate_gcc.cc",
+ "source/rotate_msa.cc",
+ "source/rotate_neon.cc",
+ "source/rotate_neon64.cc",
+ "source/row_any.cc",
+ "source/row_common.cc",
+ "source/row_gcc.cc",
+ "source/row_msa.cc",
+ "source/row_neon.cc",
+ "source/row_neon64.cc",
+ "source/row_rvv.cc",
+ "source/scale.cc",
+ "source/scale_any.cc",
+ "source/scale_argb.cc",
+ "source/scale_common.cc",
+ "source/scale_gcc.cc",
+ "source/scale_msa.cc",
+ "source/scale_neon.cc",
+ "source/scale_neon64.cc",
+ "source/scale_rgb.cc",
+ "source/scale_rvv.cc",
+ "source/scale_uv.cc",
+ "source/video_common.cc",
+ ],
+
+ cflags: [
+ "-Wall",
+ "-Werror",
+ "-Wno-unused-parameter",
+ "-fexceptions",
+ "-DHAVE_JPEG",
+ "-DLIBYUV_UNLIMITED_DATA",
+ ],
+
+ arch: {
+ arm: {
+ cflags: ["-mfpu=neon"],
+ },
+ },
+
+ shared_libs: ["libjpeg"],
+
+ export_include_dirs: ["include"],
+
+ apex_available: [
+ "//apex_available:platform",
+ "com.android.media.swcodec",
+ "com.android.virt",
+ ],
+ min_sdk_version: "29",
+}
+
+// compatibility static library until all uses of libyuv_static are replaced
+// with libyuv (b/37646797)
+cc_library_static {
+ name: "libyuv_static",
+ vendor_available: true,
+ whole_static_libs: ["libyuv"],
+ apex_available: [
+ "//apex_available:platform",
+ "com.android.media.swcodec",
+ ],
+ min_sdk_version: "29",
+}
+
+cc_test {
+ name: "libyuv_unittest",
+ static_libs: ["libyuv"],
+ shared_libs: ["libjpeg"],
+ cflags: ["-Wall", "-Werror"],
+ srcs: [
+ "unit_test/basictypes_test.cc",
+ "unit_test/color_test.cc",
+ "unit_test/compare_test.cc",
+ "unit_test/convert_test.cc",
+ "unit_test/cpu_test.cc",
+ "unit_test/cpu_thread_test.cc",
+ "unit_test/math_test.cc",
+ "unit_test/planar_test.cc",
+ "unit_test/rotate_argb_test.cc",
+ "unit_test/rotate_test.cc",
+ "unit_test/scale_argb_test.cc",
+ "unit_test/scale_plane_test.cc",
+ "unit_test/scale_rgb_test.cc",
+ "unit_test/scale_test.cc",
+ "unit_test/scale_uv_test.cc",
+ "unit_test/unit_test.cc",
+ "unit_test/video_common_test.cc",
+ ],
+}
+
+cc_test {
+ name: "compare",
+ gtest: false,
+ srcs: [
+ "util/compare.cc",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "cpuid",
+ gtest: false,
+ srcs: [
+ "util/cpuid.c",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "i444tonv12_eg",
+ gtest: false,
+ srcs: [
+ "util/i444tonv12_eg.cc",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "psnr",
+ gtest: false,
+ srcs: [
+ "util/psnr_main.cc",
+ "util/psnr.cc",
+ "util/ssim.cc",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "yuvconstants",
+ gtest: false,
+ srcs: [
+ "util/yuvconstants.c",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "yuvconvert",
+ gtest: false,
+ srcs: [
+ "util/yuvconvert.cc",
+ ],
+ static_libs: ["libyuv"],
+ shared_libs: ["libjpeg"],
+}
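
The `gtest: false` cc_test entries above build standalone command-line utilities (compare, cpuid, i444tonv12_eg, psnr, yuvconstants, yuvconvert) rather than gtest suites. A minimal sketch of driving one of them, the `cpuid` tool built from `util/cpuid.c`, from a script; the binary path is an assumption, and the output format is simply whatever the tool prints:

```python
import subprocess

def run_cpuid(binary: str = "./cpuid") -> str:
    # Hypothetical path to the built cpuid binary; adjust for your out dir.
    result = subprocess.run([binary], capture_output=True, text=True,
                            check=True)
    return result.stdout

if __name__ == "__main__":
    print(run_cpuid())
```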
diff --git a/BUILD b/BUILD
deleted file mode 100644
index 3145e36a..00000000
--- a/BUILD
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright 2011 Google Inc. All Rights Reserved.
-#
-# Description:
-# The libyuv package provides implementation yuv image conversion and
-# scaling.
-#
-# This library is used by Talk Video and WebRTC.
-#
-
-licenses(['notice']) # 3-clause BSD
-
-exports_files(['LICENSE'])
-
-package(default_visibility = ['//visibility:public'])
diff --git a/files/BUILD.gn b/BUILD.gn
index 8904fd6c..2c600b22 100644
--- a/files/BUILD.gn
+++ b/BUILD.gn
@@ -6,12 +6,13 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-import("libyuv.gni")
+import("//build/config/features.gni")
import("//testing/test.gni")
+import("libyuv.gni")
declare_args() {
- # Set to false to disable building with gflags.
- libyuv_use_gflags = true
+ # Set to false to disable building with absl flags.
+ libyuv_use_absl_flags = true
# When building a shared library using a target in WebRTC or
# Chromium projects that depends on libyuv, setting this flag
@@ -21,26 +22,40 @@ declare_args() {
config("libyuv_config") {
include_dirs = [ "include" ]
- if (is_android && current_cpu == "arm64") {
- ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ]
+ if (is_android) {
+ if (target_cpu == "arm" || target_cpu == "x86" || target_cpu == "mipsel") {
+ ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ]
+ } else {
+ ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ]
+ }
+ }
+ defines = []
+ if (!libyuv_use_neon) {
+ defines += [ "LIBYUV_DISABLE_NEON" ]
+ }
+ if (libyuv_disable_rvv) {
+ defines += [ "LIBYUV_DISABLE_RVV" ]
+ }
+ if (!libyuv_use_lsx) {
+ defines += [ "LIBYUV_DISABLE_LSX" ]
}
- if (is_android && current_cpu != "arm64") {
- ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ]
+ if (!libyuv_use_lasx) {
+ defines += [ "LIBYUV_DISABLE_LASX" ]
}
}
# This target is built when no specific target is specified on the command line.
group("default") {
testonly = true
- deps = [
- ":libyuv",
- ]
+ deps = [ ":libyuv" ]
if (libyuv_include_tests) {
deps += [
":compare",
":cpuid",
+ ":i444tonv12_eg",
":libyuv_unittest",
":psnr",
+ ":yuvconstants",
":yuvconvert",
]
}
@@ -52,13 +67,9 @@ group("libyuv") {
if (is_win && target_cpu == "x64") {
# Compile with clang in order to get inline assembly
- public_deps = [
- ":libyuv_internal(//build/toolchain/win:win_clang_x64)",
- ]
+ public_deps = [ ":libyuv_internal(//build/toolchain/win:win_clang_x64)" ]
} else {
- public_deps = [
- ":libyuv_internal",
- ]
+ public_deps = [ ":libyuv_internal" ]
}
if (libyuv_use_neon) {
@@ -69,11 +80,15 @@ group("libyuv") {
deps += [ ":libyuv_msa" ]
}
- if (libyuv_use_mmi) {
- deps += [ ":libyuv_mmi" ]
+ if (libyuv_use_lsx) {
+ deps += [ ":libyuv_lsx" ]
}
- if (!is_ios) {
+ if (libyuv_use_lasx) {
+ deps += [ ":libyuv_lasx" ]
+ }
+
+ if (!is_ios && !libyuv_disable_jpeg) {
# Make sure that clients of libyuv link with libjpeg. This can't go in
# libyuv_internal because in Windows x64 builds that will generate a clang
# build of libjpeg, and we don't want two copies.
@@ -102,7 +117,9 @@ static_library("libyuv_internal") {
"include/libyuv/row.h",
"include/libyuv/scale.h",
"include/libyuv/scale_argb.h",
+ "include/libyuv/scale_rgb.h",
"include/libyuv/scale_row.h",
+ "include/libyuv/scale_uv.h",
"include/libyuv/version.h",
"include/libyuv/video_common.h",
@@ -131,12 +148,16 @@ static_library("libyuv_internal") {
"source/row_any.cc",
"source/row_common.cc",
"source/row_gcc.cc",
+ "source/row_rvv.cc",
"source/row_win.cc",
"source/scale.cc",
"source/scale_any.cc",
"source/scale_argb.cc",
"source/scale_common.cc",
"source/scale_gcc.cc",
+ "source/scale_rgb.cc",
+ "source/scale_rvv.cc",
+ "source/scale_uv.cc",
"source/scale_win.cc",
"source/video_common.cc",
]
@@ -150,7 +171,7 @@ static_library("libyuv_internal") {
configs += [ "//build/config/gcc:symbol_visibility_default" ]
}
- if (!is_ios) {
+ if ((!is_ios || use_blink) && !libyuv_disable_jpeg) {
defines += [ "HAVE_JPEG" ]
# Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps
@@ -192,9 +213,7 @@ if (libyuv_use_neon) {
"source/scale_neon64.cc",
]
- deps = [
- ":libyuv_internal",
- ]
+ deps = [ ":libyuv_internal" ]
public_configs = [ ":libyuv_config" ]
@@ -225,28 +244,46 @@ if (libyuv_use_msa) {
"source/scale_msa.cc",
]
- deps = [
- ":libyuv_internal",
+ deps = [ ":libyuv_internal" ]
+
+ public_configs = [ ":libyuv_config" ]
+ }
+}
+
+if (libyuv_use_lsx) {
+ static_library("libyuv_lsx") {
+ sources = [
+ # LSX Source Files
+ "source/rotate_lsx.cc",
+ "source/row_lsx.cc",
+ "source/scale_lsx.cc",
+ ]
+
+ cflags_cc = [
+ "-mlsx",
+ "-Wno-c++11-narrowing",
]
+ deps = [ ":libyuv_internal" ]
+
public_configs = [ ":libyuv_config" ]
}
}
-if (libyuv_use_mmi) {
- static_library("libyuv_mmi") {
+if (libyuv_use_lasx) {
+ static_library("libyuv_lasx") {
sources = [
- # MMI Source Files
- "source/compare_mmi.cc",
- "source/rotate_mmi.cc",
- "source/row_mmi.cc",
- "source/scale_mmi.cc",
+ # LASX Source Files
+ "source/row_lasx.cc",
]
- deps = [
- ":libyuv_internal",
+ cflags_cc = [
+ "-mlasx",
+ "-Wno-c++11-narrowing",
]
+ deps = [ ":libyuv_internal" ]
+
public_configs = [ ":libyuv_config" ]
}
}
@@ -275,11 +312,10 @@ if (libyuv_include_tests) {
testonly = true
sources = [
- # sources
- # headers
"unit_test/basictypes_test.cc",
"unit_test/color_test.cc",
"unit_test/compare_test.cc",
+ "unit_test/convert_argb_test.cc",
"unit_test/convert_test.cc",
"unit_test/cpu_test.cc",
"unit_test/cpu_thread_test.cc",
@@ -288,7 +324,10 @@ if (libyuv_include_tests) {
"unit_test/rotate_argb_test.cc",
"unit_test/rotate_test.cc",
"unit_test/scale_argb_test.cc",
+ "unit_test/scale_plane_test.cc",
+ "unit_test/scale_rgb_test.cc",
"unit_test/scale_test.cc",
+ "unit_test/scale_uv_test.cc",
"unit_test/unit_test.cc",
"unit_test/unit_test.h",
"unit_test/video_common_test.cc",
@@ -300,19 +339,20 @@ if (libyuv_include_tests) {
]
defines = []
- if (libyuv_use_gflags) {
- defines += [ "LIBYUV_USE_GFLAGS" ]
- deps += [ "//third_party/gflags" ]
+ if (libyuv_use_absl_flags) {
+ defines += [ "LIBYUV_USE_ABSL_FLAGS" ]
+ deps += [
+ "//third_party/abseil-cpp/absl/flags:flag",
+ "//third_party/abseil-cpp/absl/flags:parse",
+ ]
}
configs += [ ":libyuv_unittest_warnings_config" ]
- public_deps = [
- "//testing/gtest",
- ]
+ public_deps = [ "//testing/gtest" ]
public_configs = [ ":libyuv_unittest_config" ]
- if (is_linux) {
+ if (is_linux || is_chromeos) {
cflags = [ "-fexceptions" ]
}
if (is_ios) {
@@ -349,10 +389,8 @@ if (libyuv_include_tests) {
# sources
"util/compare.cc",
]
- deps = [
- ":libyuv",
- ]
- if (is_linux) {
+ deps = [ ":libyuv" ]
+ if (is_linux || is_chromeos) {
cflags = [ "-fexceptions" ]
}
}
@@ -362,10 +400,19 @@ if (libyuv_include_tests) {
# sources
"util/yuvconvert.cc",
]
- deps = [
- ":libyuv",
+ deps = [ ":libyuv" ]
+ if (is_linux || is_chromeos) {
+ cflags = [ "-fexceptions" ]
+ }
+ }
+
+ executable("yuvconstants") {
+ sources = [
+ # sources
+ "util/yuvconstants.c",
]
- if (is_linux) {
+ deps = [ ":libyuv" ]
+ if (is_linux || is_chromeos) {
cflags = [ "-fexceptions" ]
}
}
@@ -377,22 +424,26 @@ if (libyuv_include_tests) {
"util/psnr_main.cc",
"util/ssim.cc",
]
- deps = [
- ":libyuv",
- ]
+ deps = [ ":libyuv" ]
if (!is_ios && !libyuv_disable_jpeg) {
defines = [ "HAVE_JPEG" ]
}
}
+ executable("i444tonv12_eg") {
+ sources = [
+ # sources
+ "util/i444tonv12_eg.cc",
+ ]
+ deps = [ ":libyuv" ]
+ }
+
executable("cpuid") {
sources = [
# sources
"util/cpuid.c",
]
- deps = [
- ":libyuv",
- ]
+ deps = [ ":libyuv" ]
}
}
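
The new `libyuv_config` defines are driven by GN args that appear in this diff, such as `libyuv_use_neon`, `libyuv_disable_rvv`, `libyuv_use_lsx`, and `libyuv_use_lasx`. A minimal sketch, assuming a depot_tools-style checkout with `gn` on PATH, that writes an `args.gn` and generates a build (the chosen values are only an example):

```python
import pathlib
import subprocess

ARGS = {
    "libyuv_use_neon": "false",      # adds LIBYUV_DISABLE_NEON via libyuv_config
    "libyuv_disable_rvv": "true",    # adds LIBYUV_DISABLE_RVV
    "libyuv_include_tests": "true",  # pulls in libyuv_unittest and the util tools
}

out = pathlib.Path("out/Example")
out.mkdir(parents=True, exist_ok=True)
# gn gen reads args.gn from the output directory.
(out / "args.gn").write_text("".join(f"{k} = {v}\n" for k, v in ARGS.items()))
subprocess.run(["gn", "gen", str(out)], check=True)
```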
diff --git a/files/CM_linux_packages.cmake b/CM_linux_packages.cmake
index 5f676f89..a073edfa 100644
--- a/files/CM_linux_packages.cmake
+++ b/CM_linux_packages.cmake
@@ -8,7 +8,7 @@ SET ( YUV_VER_MAJOR 0 )
SET ( YUV_VER_MINOR 0 )
SET ( YUV_VER_PATCH ${YUV_VERSION_NUMBER} )
SET ( YUV_VERSION ${YUV_VER_MAJOR}.${YUV_VER_MINOR}.${YUV_VER_PATCH} )
-MESSAGE ( "Building ver.: ${YUV_VERSION}" )
+MESSAGE ( VERBOSE "Building ver.: ${YUV_VERSION}" )
# is this a 32-bit or 64-bit build?
IF ( CMAKE_SIZEOF_VOID_P EQUAL 8 )
@@ -45,7 +45,7 @@ ELSE ()
SET ( YUV_SYSTEM_NAME "amd-${YUV_BIT_SIZE}" )
ENDIF ()
ENDIF ()
-MESSAGE ( "Packaging for: ${YUV_SYSTEM_NAME}" )
+MESSAGE ( VERBOSE "Packaging for: ${YUV_SYSTEM_NAME}" )
# define all the variables needed by CPack to create .deb and .rpm packages
SET ( CPACK_PACKAGE_VENDOR "Frank Barchard" )
diff --git a/files/CMakeLists.txt b/CMakeLists.txt
index 188a26b7..9abfa74b 100644
--- a/files/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@
PROJECT ( YUV C CXX ) # "C" is required even for C++ projects
CMAKE_MINIMUM_REQUIRED( VERSION 2.8.12 )
-OPTION( TEST "Built unit tests" OFF )
+OPTION( UNIT_TEST "Built unit tests" OFF )
SET ( ly_base_dir ${PROJECT_SOURCE_DIR} )
SET ( ly_src_dir ${ly_base_dir}/source )
@@ -22,6 +22,10 @@ LIST ( SORT ly_unittest_sources )
INCLUDE_DIRECTORIES( BEFORE ${ly_inc_dir} )
+if(MSVC)
+ ADD_DEFINITIONS ( -D_CRT_SECURE_NO_WARNINGS )
+endif()
+
# this creates the static library (.a)
ADD_LIBRARY ( ${ly_lib_static} STATIC ${ly_source_files} )
@@ -29,23 +33,36 @@ ADD_LIBRARY ( ${ly_lib_static} STATIC ${ly_source_files} )
ADD_LIBRARY ( ${ly_lib_shared} SHARED ${ly_source_files} )
SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES OUTPUT_NAME "${ly_lib_name}" )
SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES PREFIX "lib" )
+if(WIN32)
+ SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES IMPORT_PREFIX "lib" )
+endif()
+
+# this creates the cpuid tool
+ADD_EXECUTABLE ( cpuid ${ly_base_dir}/util/cpuid.c )
+TARGET_LINK_LIBRARIES ( cpuid ${ly_lib_static} )
# this creates the conversion tool
ADD_EXECUTABLE ( yuvconvert ${ly_base_dir}/util/yuvconvert.cc )
TARGET_LINK_LIBRARIES ( yuvconvert ${ly_lib_static} )
+# this creates the yuvconstants tool
+ADD_EXECUTABLE ( yuvconstants ${ly_base_dir}/util/yuvconstants.c )
+TARGET_LINK_LIBRARIES ( yuvconstants ${ly_lib_static} )
-INCLUDE ( FindJPEG )
+find_package ( JPEG )
if (JPEG_FOUND)
include_directories( ${JPEG_INCLUDE_DIR} )
- target_link_libraries( yuvconvert ${JPEG_LIBRARY} )
+ target_link_libraries( ${ly_lib_shared} ${JPEG_LIBRARY} )
add_definitions( -DHAVE_JPEG )
endif()
-if(TEST)
+if(UNIT_TEST)
find_library(GTEST_LIBRARY gtest)
if(GTEST_LIBRARY STREQUAL "GTEST_LIBRARY-NOTFOUND")
set(GTEST_SRC_DIR /usr/src/gtest CACHE STRING "Location of gtest sources")
+ if (CMAKE_CROSSCOMPILING)
+ set(GTEST_SRC_DIR third_party/googletest/src/googletest)
+ endif()
if(EXISTS ${GTEST_SRC_DIR}/src/gtest-all.cc)
message(STATUS "building gtest from sources in ${GTEST_SRC_DIR}")
set(gtest_sources ${GTEST_SRC_DIR}/src/gtest-all.cc)
@@ -54,7 +71,7 @@ if(TEST)
include_directories(${GTEST_SRC_DIR}/include)
set(GTEST_LIBRARY gtest)
else()
- message(FATAL_ERROR "TEST is set but unable to find gtest library")
+ message(FATAL_ERROR "UNIT_TEST is set but unable to find gtest library")
endif()
endif()
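
With the option renamed from `TEST` to `UNIT_TEST`, any script that configures the CMake build needs the new flag. A minimal configure-and-build sketch (directory paths are assumptions):

```python
import subprocess

def configure_and_build(src_dir: str = ".", build_dir: str = "build") -> None:
    # UNIT_TEST replaces the old TEST option; HAVE_JPEG is added
    # automatically by the CMakeLists above when find_package(JPEG) succeeds.
    subprocess.run(["cmake", "-S", src_dir, "-B", build_dir,
                    "-DUNIT_TEST=ON"], check=True)
    subprocess.run(["cmake", "--build", build_dir], check=True)

if __name__ == "__main__":
    configure_and_build()
```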
diff --git a/DEPS b/DEPS
new file mode 100644
index 00000000..70ed1d58
--- /dev/null
+++ b/DEPS
@@ -0,0 +1,2576 @@
+gclient_gn_args_file = 'src/build/config/gclient_args.gni'
+gclient_gn_args = [
+ 'generate_location_tags',
+]
+
+vars = {
+ 'chromium_git': 'https://chromium.googlesource.com',
+ 'chromium_revision': 'af3d01376bec75a68f90160bfd38057d60510a2b',
+ 'gn_version': 'git_revision:fae280eabe5d31accc53100137459ece19a7a295',
+ # ninja CIPD package version.
+ # https://chrome-infra-packages.appspot.com/p/infra/3pp/tools/ninja
+ 'ninja_version': 'version:2@1.11.1.chromium.6',
+ # reclient CIPD package version
+ 'reclient_version': 're_client_version:0.110.0.43ec6b1-gomaip',
+
+ # Keep the Chromium default of generating location tags.
+ 'generate_location_tags': True,
+
+ # By default, download the fuchsia sdk from the public sdk directory.
+ 'fuchsia_sdk_cipd_prefix': 'fuchsia/sdk/core/',
+ 'fuchsia_version': 'version:15.20230909.2.1',
+ # By default, download the fuchsia images from the fuchsia GCS bucket.
+ 'fuchsia_images_bucket': 'fuchsia',
+ 'checkout_fuchsia': False,
+ # Since the images are hundreds of MB, default to only downloading the image
+ # most commonly useful for developers. Bots and developers that need to use
+ # other images can override this with additional images.
+ 'checkout_fuchsia_boot_images': "terminal.qemu-x64,terminal.x64",
+ 'checkout_fuchsia_product_bundles': '"{checkout_fuchsia_boot_images}" != ""',
+}
+
+deps = {
+ 'src/build':
+ Var('chromium_git') + '/chromium/src/build' + '@' + '5885d3c24833ad72845a52a1b913a2b8bc651b56',
+ 'src/buildtools':
+ Var('chromium_git') + '/chromium/src/buildtools' + '@' + '79ab87fa54614258c4c95891e873223371194525',
+ 'src/testing':
+ Var('chromium_git') + '/chromium/src/testing' + '@' + '51e9a02297057cc0e917763a51e16680b7d16fb6',
+ 'src/third_party':
+ Var('chromium_git') + '/chromium/src/third_party' + '@' + '2dc4b18abd1003ce7b1eda509dc96f12d49a9667',
+
+ 'src/buildtools/linux64': {
+ 'packages': [
+ {
+ 'package': 'gn/gn/linux-${{arch}}',
+ 'version': Var('gn_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ 'condition': 'host_os == "linux"',
+ },
+
+ 'src/buildtools/mac': {
+ 'packages': [
+ {
+ 'package': 'gn/gn/mac-${{arch}}',
+ 'version': Var('gn_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ 'condition': 'host_os == "mac"',
+ },
+
+ 'src/buildtools/win': {
+ 'packages': [
+ {
+ 'package': 'gn/gn/windows-amd64',
+ 'version': Var('gn_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ 'condition': 'host_os == "win"',
+ },
+
+ 'src/buildtools/reclient': {
+ 'packages': [
+ {
+ 'package': 'infra/rbe/client/${{platform}}',
+ 'version': Var('reclient_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/catapult':
+ Var('chromium_git') + '/catapult.git' + '@' + 'fa05d995e152efdae488a2aeba397cd609fdbc9d',
+ 'src/third_party/clang-format/script':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/clang/tools/clang-format.git' + '@' + 'f97059df7f8b205064625cdb5f97b56668a125ef',
+ 'src/third_party/colorama/src':
+ Var('chromium_git') + '/external/colorama.git' + '@' + '3de9f013df4b470069d03d250224062e8cf15c49',
+ 'src/third_party/cpu_features/src': {
+ 'url': Var('chromium_git') + '/external/github.com/google/cpu_features.git' + '@' + '936b9ab5515dead115606559502e3864958f7f6e',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/depot_tools':
+ Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + 'd3e43dd4319ba169c0aaf44547eecf861f2fe5da',
+ 'src/third_party/freetype/src':
+ Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '9e3c5d7e183c1a8d5ed8868d7d28ef18d3ec9ec8',
+ 'third_party/fuchsia-gn-sdk': {
+ 'url': Var('chromium_git') + '/chromium/src/third_party/fuchsia-gn-sdk.git' + '@' + '0d6902558d92fe3d49ba9a8f638ddea829be595b',
+ 'condition': 'checkout_fuchsia',
+ },
+ 'src/third_party/googletest/src':
+ Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + 'af29db7ec28d6df1c7f0f745186884091e602e07',
+ 'src/third_party/harfbuzz-ng/src':
+ Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + 'db700b5670d9475cc8ed4880cc9447b232c5e432',
+ 'src/third_party/libc++/src':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + '84fb809dd6dae36d556dc0bb702c6cc2ce9d4b80',
+ 'src/third_party/libc++abi/src':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '8d21803b9076b16d46c32e2f10da191ee758520c',
+ 'src/third_party/libunwind/src':
+ Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'f1c687e0aaf0d70b9a53a150e9be5cb63af9215f',
+ 'src/third_party/libjpeg_turbo':
+ Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '30bdb85e302ecfc52593636b2f44af438e05e784',
+ 'src/third_party/nasm':
+ Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + '7fc833e889d1afda72c06220e5bed8fb43b2e5ce',
+ 'src/tools':
+ Var('chromium_git') + '/chromium/src/tools' + '@' + 'a76c0dbb64c603a0d45e0c6dfae3a351b6e1adf1',
+
+ # libyuv-only dependencies (not present in Chromium).
+ 'src/third_party/gtest-parallel':
+ Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e',
+
+ 'src/third_party/lss': {
+ 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + 'ce877209e11aa69dcfffbd53ef90ea1d07136521',
+ 'condition': 'checkout_android or checkout_linux',
+ },
+
+ # Android deps:
+ 'src/third_party/accessibility_test_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/accessibility-test-framework',
+ 'version': 'b5ec1e56e58e56bc1a0c77d43111c37f9b512c8a',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/kotlin_stdlib': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/kotlin_stdlib',
+ 'version': 'Z1gsqhL967kFQecxKrRwXHbl-vwQjpv0l7PMUZ0EVO8C',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/kotlinc/current': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/kotlinc',
+ 'version': 'Rr02Gf2EkaeSs3EhSUHhPqDHSd1AzimrM6cRYUJCPjQC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/boringssl/src':
+ 'https://boringssl.googlesource.com/boringssl.git' + '@' + '20a06474c0b4a16779311bfe98ba69dc2402101d',
+ 'src/base': {
+ 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'd407b7061bce341bb6e11b539ea86c46c949ac4c',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/bazel': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/bazel',
+ 'version': 'VjMsf48QUWw8n7XtJP2AuSjIGmbQeYdWdwyxVvIRLmAC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/bouncycastle': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/bouncycastle',
+ 'version': 'c078e87552ba26e776566fdaf0f22cd8712743d0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/android_toolchain': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_toolchain/android_toolchain',
+ 'version': 'R_8suM8m0oHbZ1awdxGXvKEFpAOETscbfZxkkMthyk8C',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/androidx': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/androidx',
+ 'version': 'y7rF_rx56mD3FGhMiqnlbQ6HOqHJ95xUFNX1m-_a988C',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_support_test_runner': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_support_test_runner',
+ 'version': '96d4bf848cd210fdcbca6bcc8c1b4b39cbd93141',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/android_sdk/public': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_sdk/public/build-tools/34.0.0',
+ 'version': 'YK9Rzw3fDzMHVzatNN6VlyoD_81amLZpN1AbmkdOd6AC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/emulator',
+ 'version': '9lGp8nTUCRRWGMnI_96HcKfzjnxEJKUcfvfwmA3wXNkC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/patcher',
+ 'version': 'I6FNMhrXlpB-E1lOhMlvld7xt9lBVNOO83KIluXDyA0C',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/platform-tools',
+ 'version': 'HWVsGs2HCKgSVv41FsOcsfJbNcB0UFiNrF6Tc4yRArYC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/platforms/android-34',
+ 'version': 'u-bhWbTME6u-DjypTgr3ZikCyeAeU6txkR9ET6Uudc8C',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/platforms/android-tiramisuprivacysandbox',
+ 'version': 'YWMYkzyxGBgVsty0GhXL1oxbY0pGXQIgFc0Rh7ZMRPYC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/sources/android-31',
+ 'version': '_a_BcnANjPYw5mSKlNHa7GFY8yc1kdqj2rmQgac7yUcC',
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/cmdline-tools',
+ 'version': 'EWnL2r7oV5GtE9Ef7GyohyFam42wtMtEKYU4dCb3U1YC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/tools/clang/dsymutil': {
+ 'packages': [
+ {
+ 'package': 'chromium/llvm-build-tools/dsymutil',
+ 'version': 'OWlhXkmj18li3yhJk59Kmjbc5KdgLh56TwCd1qBdzlIC',
+ }
+ ],
+ 'condition': 'checkout_mac',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/android_build_tools/aapt2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_build_tools/aapt2',
+ 'version': 'STY0BXlZxsEhudnlXQFed-B5UpwehcoM0sYqor6qRqsC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/byte_buddy': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/byte_buddy',
+ 'version': 'c9b53316603fc2d997c899c7ca1707f809b918cd',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/byte_buddy/android_sdk_build_tools_25_0_2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_sdk/public/build-tools',
+ 'version': 'kwIs2vdfTm93yEP8LG5aSnchN4BVEdVxbqQtF4XpPdkC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/ced/src': {
+ 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + 'ba412eaaacd3186085babcd901679a48863c7dd5',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/errorprone/lib': {
+ 'url': Var('chromium_git') + '/chromium/third_party/errorprone.git' + '@' + '980d49e839aa4984015efed34b0134d4b2c9b6d7',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/findbugs': {
+ 'url': Var('chromium_git') + '/chromium/deps/findbugs.git' + '@' + '4275d9ac8610db6b1bc9a5e887f97e41b33fac67',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/gson': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/gson',
+ 'version': '681931c9778045903a0ed59856ce2dd8dd7bf7ca',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/guava': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/guava',
+ 'version': 'a6fba501f3a0de88b9be1daa2052632de5b96a46',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/hamcrest': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/hamcrest',
+ 'version': '37eccfc658fe79695d6abb6dd497463c4372032f',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/icu': {
+ 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'e8c3bc9ea97d4423ad0515e5f1c064f486dae8b1',
+ },
+ 'src/third_party/icu4j': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/icu4j',
+ 'version': 'e87e5bed2b4935913ee26a3ebd0b723ee2344354',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/intellij': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/intellij',
+ 'version': '77c2721b024b36ee073402c08e6d8428c0295336',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/jdk': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/jdk',
+ 'version': 'GCFtf5t6M4HlrHj6NXedHbpHp2xjgognF8ptNci4478C',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/jsr-305/src': {
+ 'url': Var('chromium_git') + '/external/jsr-305.git' + '@' + '642c508235471f7220af6d5df2d3210e3bfc0919',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/junit/src': {
+ 'url': Var('chromium_git') + '/external/junit.git' + '@' + '05fe2a64f59127c02135be22f416e91260d6ede6',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/libunwindstack': {
+ 'url': Var('chromium_git') + '/chromium/src/third_party/libunwindstack.git' + '@' + '4dbfa0e8c844c8e243b297bc185e54a99ff94f9e',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/ninja': {
+ 'packages': [
+ {
+ 'package': 'infra/3pp/tools/ninja/${{platform}}',
+ 'version': Var('ninja_version'),
+ }
+ ],
+ 'dep_type': 'cipd',
+ },
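+ # In the ninja entry above, '${{platform}}' is a CIPD placeholder that
+ # gclient expands to the host platform, so a single entry serves every
+ # host OS; on a 64-bit Linux machine, for example, the resolved package
+ # would be something like:
+ #
+ #   infra/3pp/tools/ninja/linux-amd64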
+ 'src/third_party/mockito/src': {
+ 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '7c3641bcef717ffa7d765f2c86b847d0aab1aac9',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/objenesis': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/objenesis',
+ 'version': 'tknDblENYi8IaJYyD6tUahUyHYZlzJ_Y74_QZSz4DpIC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/ow2_asm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/ow2_asm',
+ 'version': 'NNAhdJzMdnutUVqfSJm5v0tVazA9l3Dd6CRwH6N4Q5kC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/r8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/r8',
+ 'version': 'O1BBWiBTIeNUcraX8STMtQXVaCleu6SJJjWCcnfhPLkC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ # This duplication is intentional, so we avoid updating the r8.jar used by
+ # dexing unless necessary, since each update invalidates all incremental
+ # dexing and unnecessarily slows down all bots.
+ 'src/third_party/r8/d8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/r8',
+ 'version': 'vw5kLlW3-suSlCKSO9OQpFWpR8oDnvQ8k1RgKNUapQYC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
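+ # A pinned instance like the one above can be inspected with the cipd
+ # CLI if needed; a hedged example, reusing the d8 instance ID from the
+ # entry above:
+ #
+ #   cipd describe chromium/third_party/r8 \
+ #       -version vw5kLlW3-suSlCKSO9OQpFWpR8oDnvQ8k1RgKNUapQYC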
+ 'src/third_party/proguard': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/proguard',
+ 'version': 'Fd91BJFVlmiO6c46YMTsdy7n2f5Sk2hVVGlzPLvqZPsC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/requests/src': {
+ 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'c7e0fc087ceeadb8b4c84a0953a422c474093d6d',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/robolectric': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/robolectric',
+ 'version': 'hzetqh1qFI32FOgQroZvGcGdomrgVBJ6WKRnl1KFw6EC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/sqlite4java': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/sqlite4java',
+ 'version': 'LofjKH9dgXIAJhRYCPQlMFywSwxYimrfDeBmaHc-Z5EC',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/turbine': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/turbine',
+ 'version': '2I2Nz480QsuCxpQ1lMfbigX8l5HAhX3_ykWU4TKRGo4C',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/ub-uiautomator/lib': {
+ 'url': Var('chromium_git') + '/chromium/third_party/ub-uiautomator.git' + '@' + '00270549ce3161ae72ceb24712618ea28b4f9434',
+ 'condition': 'checkout_android',
+ },
+
+ # iOS deps:
+ 'src/ios': {
+ 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + 'ddd58e86cf4ebdc0db60a5d0f3c323de49bb295c',
+ 'condition': 'checkout_ios',
+ },
+
+ # Everything coming after this is automatically updated by the auto-roller.
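+ # The generated entries below all share one shape: a CIPD package under
+ # chromium/third_party/android_deps/libs pinned to a tag of the form
+ # 'version:2@<upstream version>.crN', where the '.crN' suffix tracks
+ # Chromium-side repackagings of the same upstream release. To refresh
+ # the block, rerun the generator named in the marker below (assuming a
+ # Chromium-style checkout rooted at src/):
+ #
+ #   python3 src/third_party/android_deps/fetch_all.py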
+ # === ANDROID_DEPS Generated Code Start ===
+ # Generated by //third_party/android_deps/fetch_all.py
+ 'src/third_party/android_deps/libs/android_arch_core_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_core_common',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_core_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_core_runtime',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_common_java8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common_java8',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata_core',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_viewmodel': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_viewmodel',
+ 'version': 'version:2@1.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_animated_vector_drawable',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_appcompat_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_appcompat_v7',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_asynclayoutinflater': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_asynclayoutinflater',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_cardview_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_collections': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_collections',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_coordinatorlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_coordinatorlayout',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_cursoradapter': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_cursoradapter',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_customview': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_customview',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_design': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_design',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_documentfile': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_documentfile',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_drawerlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_drawerlayout',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_interpolator': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_interpolator',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_loader': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_loader',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_localbroadcastmanager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_localbroadcastmanager',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_multidex': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_multidex',
+ 'version': 'version:2@1.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_print': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_print',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_recyclerview_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_recyclerview_v7',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_slidingpanelayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_slidingpanelayout',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_annotations',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_compat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_compat',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_core_ui': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_ui',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_core_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_utils',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_fragment': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_fragment',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_media_compat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_media_compat',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_v4': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v4',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_vector_drawable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_vector_drawable',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_swiperefreshlayout': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_swiperefreshlayout',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_transition': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_transition',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_versionedparcelable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_versionedparcelable',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_viewpager': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_viewpager',
+ 'version': 'version:2@28.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_common',
+ 'version': 'version:2@30.2.0-beta01.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api',
+ 'version': 'version:2@30.2.0-beta01.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_tools_sdk_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_tools_sdk_common',
+ 'version': 'version:2@30.2.0-beta01.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine',
+ 'version': 'version:2@2.8.8.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms',
+ 'version': 'version:2@1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_annotations',
+ 'version': 'version:2@4.1.1.4.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_apps_common_testing_accessibility_framework_accessibility_test_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_apps_common_testing_accessibility_framework_accessibility_test_framework',
+ 'version': 'version:2@4.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_datatransport_transport_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_datatransport_transport_api',
+ 'version': 'version:2@2.2.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth',
+ 'version': 'version:2@20.1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone',
+ 'version': 'version:2@18.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base',
+ 'version': 'version:2@18.0.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_base': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base',
+ 'version': 'version:2@18.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_basement': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement',
+ 'version': 'version:2@18.1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cloud_messaging': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cloud_messaging',
+ 'version': 'version:2@16.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_flags': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_flags',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_gcm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_iid': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps',
+ 'version': 'version:2@18.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_location': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location',
+ 'version': 'version:2@19.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_stats': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_stats',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_tasks': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks',
+ 'version': 'version:2@18.0.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision',
+ 'version': 'version:2@20.1.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common',
+ 'version': 'version:2@19.1.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_material_material': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_material_material',
+ 'version': 'version:2@1.7.0-alpha02.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_play_core_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core_common',
+ 'version': 'version:2@2.0.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_play_feature_delivery': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_feature_delivery',
+ 'version': 'version:2@2.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_auto_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_auto_common',
+ 'version': 'version:2@1.2.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_service_auto_service': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service',
+ 'version': 'version:2@1.0-rc6.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations',
+ 'version': 'version:2@1.0-rc6.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations',
+ 'version': 'version:2@1.10.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_code_findbugs_jsr305': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jsr305',
+ 'version': 'version:2@3.0.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_code_gson_gson': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_code_gson_gson',
+ 'version': 'version:2@2.9.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger',
+ 'version': 'version:2@2.30.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger_compiler': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_compiler',
+ 'version': 'version:2@2.30.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger_producers': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_producers',
+ 'version': 'version:2@2.30.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_dagger_dagger_spi': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_spi',
+ 'version': 'version:2@2.30.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation',
+ 'version': 'version:2@2.11.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations',
+ 'version': 'version:2@2.18.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api',
+ 'version': 'version:2@2.11.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_core',
+ 'version': 'version:2@2.11.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations',
+ 'version': 'version:2@2.11.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_javac': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac',
+ 'version': 'version:2@9+181-r4173-1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_errorprone_javac_shaded': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac_shaded',
+ 'version': 'version:2@9-dev-r4023-3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_annotations',
+ 'version': 'version:2@16.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_common',
+ 'version': 'version:2@19.5.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_components': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_components',
+ 'version': 'version:2@16.1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_encoders': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_encoders',
+ 'version': 'version:2@16.1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_encoders_json': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_encoders_json',
+ 'version': 'version:2@17.1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_iid': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_iid',
+ 'version': 'version:2@21.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_iid_interop': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_iid_interop',
+ 'version': 'version:2@17.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_installations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_installations',
+ 'version': 'version:2@16.3.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_installations_interop': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_installations_interop',
+ 'version': 'version:2@16.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_measurement_connector': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_measurement_connector',
+ 'version': 'version:2@18.0.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_firebase_firebase_messaging': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_messaging',
+ 'version': 'version:2@21.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format',
+ 'version': 'version:2@1.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_guava_failureaccess': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_failureaccess',
+ 'version': 'version:2@1.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_guava_guava': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava',
+ 'version': 'version:2@31.1-jre.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_guava_guava_android': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava_android',
+ 'version': 'version:2@31.1-android.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_guava_listenablefuture': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_guava_listenablefuture',
+ 'version': 'version:2@1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations',
+ 'version': 'version:2@1.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_java': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_java',
+ 'version': 'version:2@3.19.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite',
+ 'version': 'version:2@3.21.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils',
+ 'version': 'version:2@1.3.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_squareup_javapoet': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_javapoet',
+ 'version': 'version:2@1.13.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_squareup_javawriter': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_javawriter',
+ 'version': 'version:2@2.1.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_squareup_okio_okio_jvm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_okio_okio_jvm',
+ 'version': 'version:2@3.3.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_squareup_wire_wire_runtime_jvm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_wire_wire_runtime_jvm',
+ 'version': 'version:2@4.7.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils',
+ 'version': 'version:2@4.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_grpc_grpc_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_api',
+ 'version': 'version:2@1.49.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_grpc_grpc_binder': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_binder',
+ 'version': 'version:2@1.49.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_grpc_grpc_context': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_context',
+ 'version': 'version:2@1.49.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_grpc_grpc_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_core',
+ 'version': 'version:2@1.49.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_grpc_grpc_protobuf_lite': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_protobuf_lite',
+ 'version': 'version:2@1.49.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_grpc_grpc_stub': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_stub',
+ 'version': 'version:2@1.49.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/io_perfmark_perfmark_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/io_perfmark_perfmark_api',
+ 'version': 'version:2@0.25.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/javax_annotation_javax_annotation_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/javax_annotation_javax_annotation_api',
+ 'version': 'version:2@1.3.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/javax_annotation_jsr250_api': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/javax_annotation_jsr250_api',
+ 'version': 'version:2@1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/javax_inject_javax_inject': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/javax_inject_javax_inject',
+ 'version': 'version:2@1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/net_bytebuddy_byte_buddy': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/net_bytebuddy_byte_buddy',
+ 'version': 'version:2@1.14.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/net_bytebuddy_byte_buddy_agent': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/net_bytebuddy_byte_buddy_agent',
+ 'version': 'version:2@1.14.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/net_ltgt_gradle_incap_incap': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/net_ltgt_gradle_incap_incap',
+ 'version': 'version:2@0.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_bouncycastle_bcprov_jdk18on': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_bouncycastle_bcprov_jdk18on',
+ 'version': 'version:2@1.72.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup',
+ 'version': 'version:2@1.2.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual',
+ 'version': 'version:2@2.5.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_checker_qual': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_qual',
+ 'version': 'version:2@3.25.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_checker_util': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_util',
+ 'version': 'version:2@3.25.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone',
+ 'version': 'version:2@3.15.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations',
+ 'version': 'version:2@1.21.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_conscrypt_conscrypt_openjdk_uber': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_conscrypt_conscrypt_openjdk_uber',
+ 'version': 'version:2@2.5.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit',
+ 'version': 'version:2@4.4.1.201607150455-r.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_hamcrest_hamcrest': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_hamcrest_hamcrest',
+ 'version': 'version:2@2.2.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7',
+ 'version': 'version:2@1.8.20.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8',
+ 'version': 'version:2@1.8.20.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android',
+ 'version': 'version:2@1.6.4.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm',
+ 'version': 'version:2@1.6.4.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_guava': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_guava',
+ 'version': 'version:2@1.6.4.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm',
+ 'version': 'version:2@0.1.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_jsoup_jsoup': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_jsoup_jsoup',
+ 'version': 'version:2@1.15.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_mockito_mockito_android': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_android',
+ 'version': 'version:2@5.4.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_mockito_mockito_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_core',
+ 'version': 'version:2@5.4.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_mockito_mockito_subclass': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_subclass',
+ 'version': 'version:2@5.4.0.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_objenesis_objenesis': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_objenesis_objenesis',
+ 'version': 'version:2@3.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm',
+ 'version': 'version:2@9.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_analysis': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_analysis',
+ 'version': 'version:2@9.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_commons': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_commons',
+ 'version': 'version:2@9.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_tree': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_tree',
+ 'version': 'version:2@9.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_ow2_asm_asm_util': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_util',
+ 'version': 'version:2@9.5.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_pcollections_pcollections': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_pcollections_pcollections',
+ 'version': 'version:2@3.1.4.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_annotations',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_junit': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_junit',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_nativeruntime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_nativeruntime',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_nativeruntime_dist_compat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_nativeruntime_dist_compat',
+ 'version': 'version:2@1.0.1.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_pluginapi': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_pluginapi',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_resources': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_resources',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_robolectric': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_robolectric',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_sandbox': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_sandbox',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadowapi': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadowapi',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadows_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_framework',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_shadows_playservices': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_playservices',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/org_robolectric_utils_reflector': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils_reflector',
+ 'version': 'version:2@4.10.3.cr1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ # === ANDROID_DEPS Generated Code End ===
+}
+
+pre_deps_hooks = [
+ {
+ # Remove any symlinks from before 177567c518b121731e507e9b9c4049c4dc96e4c8.
+ # TODO(kjellander): Remove this in March 2017.
+ 'name': 'cleanup_links',
+ 'pattern': '.',
+ 'action': ['python3', 'src/cleanup_links.py'],
+ },
+]
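+# pre_deps_hooks run before any of the deps above are synced; the regular
+# hooks below only run once the checkout is complete.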
+
+hooks = [
+ {
+ # This clobbers when necessary (based on get_landmines.py). It should be
+ # an early hook, but it needs to run after Chromium is synced and the
+ # links are set up, so that the script actually exists.
+ 'name': 'landmines',
+ 'pattern': '.',
+ 'action': [
+ 'python3',
+ 'src/build/landmines.py',
+ '--landmine-scripts',
+ 'src/tools_libyuv/get_landmines.py',
+ '--src-dir',
+ 'src',
+ ],
+ },
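+ # A "landmine" is a reason string emitted by get_landmines.py; when the
+ # emitted list changes between syncs, landmines.py schedules a clobber
+ # of the build output rather than risking a stale incremental build.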
+ # Downloads the current stable linux sysroot to build/linux/ if needed.
+ {
+ 'name': 'sysroot_arm',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and checkout_arm',
+ 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=arm'],
+ },
+ {
+ 'name': 'sysroot_arm64',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and checkout_arm64',
+ 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=arm64'],
+ },
+ {
+ 'name': 'sysroot_x86',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and (checkout_x86 or checkout_x64)',
+ 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=x86'],
+ },
+ {
+ 'name': 'sysroot_mips',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and checkout_mips',
+ 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=mips'],
+ },
+ {
+ 'name': 'sysroot_x64',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and checkout_x64',
+ 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=x64'],
+ },
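+ # Each sysroot hook wraps the same script with a different --arch; it
+ # can also be run by hand when switching target architectures, e.g.:
+ #
+ #   python3 src/build/linux/sysroot_scripts/install-sysroot.py --arch=arm64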
+ {
+ # Update the Windows toolchain if necessary.
+ 'name': 'win_toolchain',
+ 'pattern': '.',
+ 'action': ['python3', 'src/build/vs_toolchain.py', 'update'],
+ },
+ {
+ # Update the Mac toolchain if necessary.
+ 'name': 'mac_toolchain',
+ 'pattern': '.',
+ 'action': ['python3', 'src/build/mac_toolchain.py'],
+ 'condition': 'checkout_mac',
+ },
+ {
+ 'name': 'msan_chained_origins_focal',
+ 'pattern': '.',
+ 'condition': 'checkout_instrumented_libraries',
+ 'action': [ 'python3',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--no_auth',
+ '--bucket', 'chromium-instrumented-libraries',
+ '-s', 'src/third_party/instrumented_libraries/binaries/msan-chained-origins-focal.tgz.sha1',
+ ],
+ },
+ {
+ 'name': 'msan_no_origins_focal',
+ 'pattern': '.',
+ 'condition': 'checkout_instrumented_libraries',
+ 'action': [ 'python3',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--no_auth',
+ '--bucket', 'chromium-instrumented-libraries',
+ '-s', 'src/third_party/instrumented_libraries/binaries/msan-no-origins-focal.tgz.sha1',
+ ],
+ },
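+ # download_from_google_storage.py resolves each '-s' stamp file to an
+ # object hash and fetches the matching archive from the named bucket, so
+ # only the *.sha1 stamps are checked in, never the binaries themselves.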
+ {
+ 'name': 'Download Fuchsia SDK from GCS',
+ 'pattern': '.',
+ 'condition': 'checkout_fuchsia',
+ 'action': [
+ 'python3',
+ 'src/build/fuchsia/update_sdk.py',
+ '--cipd-prefix={fuchsia_sdk_cipd_prefix}',
+ '--version={fuchsia_version}',
+ ],
+ },
+ {
+ 'name': 'Download Fuchsia system images',
+ 'pattern': '.',
+ 'condition': 'checkout_fuchsia and checkout_fuchsia_product_bundles',
+ 'action': [
+ 'python3',
+ 'src/build/fuchsia/update_product_bundles.py',
+ '{checkout_fuchsia_boot_images}',
+ ],
+ },
+ {
+ # Pull clang if needed or requested via GYP_DEFINES.
+ # Note: On Win, this should run after win_toolchain, as it may use it.
+ 'name': 'clang',
+ 'pattern': '.',
+ 'action': ['python3', 'src/tools/clang/scripts/update.py'],
+ },
+ {
+ # Update LASTCHANGE.
+ 'name': 'lastchange',
+ 'pattern': '.',
+ 'action': ['python3', 'src/build/util/lastchange.py',
+ '-o', 'src/build/util/LASTCHANGE'],
+ },
+ # Pull clang-format binaries using checked-in hashes.
+ {
+ 'name': 'clang_format_win',
+ 'pattern': '.',
+ 'condition': 'host_os == "win"',
+ 'action': [ 'python3',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--platform=win32',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', 'src/buildtools/win/clang-format.exe.sha1',
+ ],
+ },
+ {
+ 'name': 'clang_format_mac_x64',
+ 'pattern': '.',
+ 'condition': 'host_os == "mac" and host_cpu == "x64"',
+ 'action': [ 'python3',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--platform=darwin',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', 'src/buildtools/mac/clang-format.x64.sha1',
+ '-o', 'src/buildtools/mac/clang-format',
+ ],
+ },
+ {
+ 'name': 'clang_format_mac_arm64',
+ 'pattern': '.',
+ 'condition': 'host_os == "mac" and host_cpu == "arm64"',
+ 'action': [ 'python3',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', 'src/buildtools/mac/clang-format.arm64.sha1',
+ '-o', 'src/buildtools/mac/clang-format',
+ ],
+ },
+ {
+ 'name': 'clang_format_linux',
+ 'pattern': '.',
+ 'condition': 'host_os == "linux"',
+ 'action': [ 'python3',
+ 'src/third_party/depot_tools/download_from_google_storage.py',
+ '--no_resume',
+ '--platform=linux*',
+ '--no_auth',
+ '--bucket', 'chromium-clang-format',
+ '-s', 'src/buildtools/linux64/clang-format.sha1',
+ ],
+ },
+ # Pull luci-go binaries (isolate, swarming) using checked-in hashes.
+ {
+ 'name': 'luci-go_win',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=win32',
+ '--no_auth',
+ '--bucket', 'chromium-luci',
+ '-d', 'src/tools/luci-go/win64',
+ ],
+ },
+ {
+ 'name': 'luci-go_mac',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=darwin',
+ '--no_auth',
+ '--bucket', 'chromium-luci',
+ '-d', 'src/tools/luci-go/mac64',
+ ],
+ },
+ {
+ 'name': 'luci-go_linux',
+ 'pattern': '.',
+ 'action': [ 'download_from_google_storage',
+ '--no_resume',
+ '--platform=linux*',
+ '--no_auth',
+ '--bucket', 'chromium-luci',
+ '-d', 'src/tools/luci-go/linux64',
+ ],
+ },
+ {
+ 'name': 'Generate component metadata for tests',
+ 'pattern': '.',
+ 'action': [
+ 'vpython3',
+ 'src/testing/generate_location_tags.py',
+ '--out',
+ 'src/testing/location_tags.json',
+ ],
+ },
+]
+
+recursedeps = []
diff --git a/DIR_METADATA b/DIR_METADATA
new file mode 100644
index 00000000..8bc04f15
--- /dev/null
+++ b/DIR_METADATA
@@ -0,0 +1,3 @@
+monorail {
+ component: "Internals>Images>Codecs"
+}
diff --git a/LICENSE b/LICENSE
index da40b336..c911747a 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2011, Google Inc. All rights reserved.
+Copyright 2011 The LibYuv Project Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
diff --git a/METADATA b/METADATA
index d97975ca..19d0436e 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,19 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update libyuv
+# For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md
+
+name: "libyuv"
+description: "libyuv is an open source project that includes YUV scaling and conversion functionality."
third_party {
license_type: NOTICE
+ last_upgrade_date {
+ year: 2024
+ month: 1
+ day: 11
+ }
+ identifier {
+ type: "Git"
+ value: "https://chromium.googlesource.com/libyuv/libyuv/"
+ version: "af6ac8265bbd07bcf977526458b60305c4304288"
+ }
}
diff --git a/OWNERS b/OWNERS
index a607e727..f11a7bfd 100644
--- a/OWNERS
+++ b/OWNERS
@@ -1,4 +1,11 @@
-fbarchard@google.com
-phoglund@google.com
-magjed@google.com
-chz@google.com
+mbonadei@chromium.org
+fbarchard@chromium.org
+magjed@chromium.org
+wtc@google.com
+jansson@google.com
+
+per-file *.gn=mbonadei@chromium.org,jansson@google.com
+per-file .gitignore=*
+per-file AUTHORS=*
+per-file DEPS=*
+per-file PRESUBMIT.py=mbonadei@chromium.org,jansson@google.com
diff --git a/OWNERS.android b/OWNERS.android
new file mode 100644
index 00000000..7529cb92
--- /dev/null
+++ b/OWNERS.android
@@ -0,0 +1 @@
+include platform/system/core:/janitors/OWNERS
diff --git a/files/PATENTS b/PATENTS
index 64aa5c90..64aa5c90 100644
--- a/files/PATENTS
+++ b/PATENTS
diff --git a/files/PRESUBMIT.py b/PRESUBMIT.py
index 2cf1542f..d3901caf 100755..100644
--- a/files/PRESUBMIT.py
+++ b/PRESUBMIT.py
@@ -6,50 +6,30 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-import os
-
-
-def _RunPythonTests(input_api, output_api):
- def join(*args):
- return input_api.os_path.join(input_api.PresubmitLocalPath(), *args)
-
- test_directories = [
- root for root, _, files in os.walk(join('tools_libyuv'))
- if any(f.endswith('_test.py') for f in files)
- ]
-
- tests = []
- for directory in test_directories:
- tests.extend(
- input_api.canned_checks.GetUnitTestsInDirectory(
- input_api,
- output_api,
- directory,
- whitelist=[r'.+_test\.py$']))
- return input_api.RunTests(tests, parallel=True)
-
+# Runs PRESUBMIT.py in py3 mode by git cl presubmit.
+USE_PYTHON3 = True
def _CommonChecks(input_api, output_api):
"""Checks common to both upload and commit."""
results = []
results.extend(input_api.canned_checks.RunPylint(input_api, output_api,
- black_list=(r'^base[\\\/].*\.py$',
- r'^build[\\\/].*\.py$',
- r'^buildtools[\\\/].*\.py$',
- r'^ios[\\\/].*\.py$',
- r'^out.*[\\\/].*\.py$',
- r'^testing[\\\/].*\.py$',
- r'^third_party[\\\/].*\.py$',
- r'^tools[\\\/].*\.py$',
- # TODO(kjellander): should arguably be checked.
- r'^tools_libyuv[\\\/]valgrind[\\\/].*\.py$',
- r'^xcodebuild.*[\\\/].*\.py$',),
+ files_to_skip=(r'^base[\\\/].*\.py$',
+ r'^build[\\\/].*\.py$',
+ r'^buildtools[\\\/].*\.py$',
+ r'^ios[\\\/].*\.py$',
+ r'^out.*[\\\/].*\.py$',
+ r'^testing[\\\/].*\.py$',
+ r'^third_party[\\\/].*\.py$',
+ r'^tools[\\\/].*\.py$',
+ # TODO(kjellander): should arguably be checked.
+ r'^tools_libyuv[\\\/]valgrind[\\\/].*\.py$',
+ r'^xcodebuild.*[\\\/].*\.py$',),
disabled_warnings=['F0401', # Failed to import x
'E0611', # No package y in x
'W0232', # Class has no __init__ method
],
- pylintrc='pylintrc'))
- results.extend(_RunPythonTests(input_api, output_api))
+ pylintrc='pylintrc',
+ version='2.7'))
return results
diff --git a/files/README.chromium b/README.chromium
index bddc2023..1389f285 100644
--- a/files/README.chromium
+++ b/README.chromium
@@ -1,8 +1,10 @@
Name: libyuv
-URL: http://code.google.com/p/libyuv/
-Version: 1732
+URL: https://chromium.googlesource.com/libyuv/libyuv/
+Version: 1883
License: BSD
License File: LICENSE
+Shipped: yes
Description:
libyuv is an open source project that includes YUV conversion and scaling functionality.
+
diff --git a/files/README.md b/README.md
index db70b7f0..95eeb04c 100644
--- a/files/README.md
+++ b/README.md
@@ -7,6 +7,7 @@
* Optimized for SSSE3/AVX2 on x86/x64.
* Optimized for Neon on Arm.
* Optimized for MSA on Mips.
+* Optimized for RVV on RISC-V.
### Development
diff --git a/README.version b/README.version
deleted file mode 100644
index 0e74ad15..00000000
--- a/README.version
+++ /dev/null
@@ -1,3 +0,0 @@
-Version: r1732
-BugComponent: 42195
-Owner: lajos
diff --git a/files/build_overrides/build.gni b/build_overrides/build.gni
index 6d8319b9..d9d01d51 100644
--- a/files/build_overrides/build.gni
+++ b/build_overrides/build.gni
@@ -6,9 +6,6 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-# Some non-Chromium builds don't use Chromium's third_party/binutils.
-linux_use_bundled_binutils_override = true
-
# Variable that can be used to support multiple build scenarios, like having
# Chromium specific targets in a client project's GN file etc.
build_with_chromium = false
@@ -16,6 +13,9 @@ build_with_chromium = false
# Some non-Chromium builds don't support building java targets.
enable_java_templates = true
+# Enables assertions on safety checks in libc++.
+enable_safe_libcxx = true
+
# Allow using custom suppressions files (currently not used by libyuv).
asan_suppressions_file = "//build/sanitizers/asan_suppressions.cc"
lsan_suppressions_file = "//build/sanitizers/lsan_suppressions.cc"
@@ -44,3 +44,20 @@ if (host_os == "mac") {
"hermetic toolchain if the minimum OS version is not met.")
use_system_xcode = _result == 0
}
+
+declare_args() {
+ # Tracing support requires //third_party/perfetto.
+ enable_base_tracing = false
+ use_perfetto_client_library = false
+
+ # Limits the defined //third_party/android_deps targets to only "buildCompile"
+ # and "buildCompileNoDeps" targets. This is useful for third-party
+ # repositories which do not use JUnit tests. For instance,
+ # limit_android_deps == true removes "gn gen" requirement for
+ # //third_party/robolectric .
+ limit_android_deps = false
+
+ # Allows googletest to pretty-print various absl types.
+ # Defined here rather than in gtest.gni to match chromium.
+ gtest_enable_absl_printers = true
+}
diff --git a/files/build_overrides/gtest.gni b/build_overrides/gtest.gni
index d3c3f68c..d3c3f68c 100644
--- a/files/build_overrides/gtest.gni
+++ b/build_overrides/gtest.gni
diff --git a/build_overrides/partition_alloc.gni b/build_overrides/partition_alloc.gni
new file mode 100644
index 00000000..dcf8ac2d
--- /dev/null
+++ b/build_overrides/partition_alloc.gni
@@ -0,0 +1,17 @@
+# Copyright 2022 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Use default values for PartitionAlloc as standalone library from
+# base/allocator/partition_allocator/build_overrides/partition_alloc.gni
+use_partition_alloc_as_malloc_default = false
+use_allocator_shim_default = false
+enable_backup_ref_ptr_support_default = false
+enable_mte_checked_ptr_support_default = false
+put_ref_count_in_previous_slot_default = false
+enable_backup_ref_ptr_slow_checks_default = false
+enable_dangling_raw_ptr_checks_default = false
diff --git a/files/cleanup_links.py b/cleanup_links.py
index ba290789..7d1eba9b 100755
--- a/files/cleanup_links.py
+++ b/cleanup_links.py
@@ -1,4 +1,5 @@
-#!/usr/bin/env python
+#!/usr/bin/env vpython3
+
# Copyright 2017 The LibYuv Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style license
@@ -18,8 +19,8 @@ landing that change, this script cleans up any old symlinks, avoiding annoying
manual cleanup needed in order to complete gclient sync.
"""
+import argparse
import logging
-import optparse
import os
import shelve
import subprocess
@@ -32,14 +33,14 @@ LINKS_DB = 'links'
# Version management to make future upgrades/downgrades easier to support.
SCHEMA_VERSION = 1
-class WebRTCLinkSetup(object):
+class WebRTCLinkSetup():
def __init__(self, links_db, dry_run=False):
self._dry_run = dry_run
self._links_db = links_db
def CleanupLinks(self):
logging.debug('CleanupLinks')
- for source, link_path in self._links_db.iteritems():
+    for source, link_path in self._links_db.items():
if source == 'SCHEMA_VERSION':
continue
if os.path.islink(link_path) or sys.platform.startswith('win'):
@@ -71,15 +72,15 @@ def _initialize_database(filename):
def main():
- parser = optparse.OptionParser()
- parser.add_option('-d', '--dry-run', action='store_true', default=False,
- help='Print what would be done, but don\'t perform any '
- 'operations. This will automatically set logging to '
- 'verbose.')
- parser.add_option('-v', '--verbose', action='store_const',
- const=logging.DEBUG, default=logging.INFO,
- help='Print verbose output for debugging.')
- options, _ = parser.parse_args()
+ p = argparse.ArgumentParser()
+ p.add_argument('-d', '--dry-run', action='store_true', default=False,
+ help='Print what would be done, but don\'t perform any '
+ 'operations. This will automatically set logging to '
+ 'verbose.')
+ p.add_argument('-v', '--verbose', action='store_const',
+ const=logging.DEBUG, default=logging.INFO,
+ help='Print verbose output for debugging.')
+ options = p.parse_args()
if options.dry_run:
options.verbose = logging.DEBUG
diff --git a/codereview.settings b/codereview.settings
index 9782886f..b226fae5 100644
--- a/codereview.settings
+++ b/codereview.settings
@@ -1,5 +1,5 @@
-# This file is used by git cl to get repository specific information.
+# This file is used by `git cl` to get repository specific information.
+CODE_REVIEW_SERVER: codereview.chromium.org
GERRIT_HOST: True
PROJECT: libyuv
-TRY_ON_UPLOAD: False
VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/
diff --git a/files/docs/deprecated_builds.md b/docs/deprecated_builds.md
index 29e0bf9b..8edefd78 100644
--- a/files/docs/deprecated_builds.md
+++ b/docs/deprecated_builds.md
@@ -165,11 +165,11 @@ mipsel
arm32 disassembly:
- third_party/android_ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o
+ llvm-objdump -d out/Release/obj/source/libyuv.row_neon.o
arm64 disassembly:
- third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o
+ llvm-objdump -d out/Release/obj/source/libyuv.row_neon64.o
Running tests:
@@ -239,6 +239,7 @@ If you get a compile error for atlthunk.lib on Windows, read http://www.chromium
ninja -C out/Debug libyuv_unittest
ninja -C out/Debug compare
ninja -C out/Debug yuvconvert
+ ninja -C out/Debug yuvconstants
ninja -C out/Debug psnr
ninja -C out/Debug cpuid
diff --git a/files/docs/environment_variables.md b/docs/environment_variables.md
index cd8159ad..4eb09659 100644
--- a/files/docs/environment_variables.md
+++ b/docs/environment_variables.md
@@ -22,6 +22,7 @@ By default the cpu is detected and the most advanced form of SIMD is used. But
LIBYUV_DISABLE_F16C
LIBYUV_DISABLE_AVX512BW
LIBYUV_DISABLE_AVX512VL
+ LIBYUV_DISABLE_AVX512VNNI
LIBYUV_DISABLE_AVX512VBMI
LIBYUV_DISABLE_AVX512VBMI2
LIBYUV_DISABLE_AVX512VBITALG
@@ -34,7 +35,13 @@ By default the cpu is detected and the most advanced form of SIMD is used. But
## MIPS CPUs
LIBYUV_DISABLE_MSA
- LIBYUV_DISABLE_MMI
+
+## LOONGARCH CPUs
+ LIBYUV_DISABLE_LSX
+ LIBYUV_DISABLE_LASX
+
+## RISCV CPUs
+ LIBYUV_DISABLE_RVV
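+
+These switches are read when libyuv first probes the CPU, so they must be set
+before the first call into the library. A minimal sketch, not part of this
+change, assuming POSIX setenv and the public cpu_id.h API:
+
+    #include <cstdlib>
+
+    #include "libyuv/cpu_id.h"
+
+    int main() {
+      // Must run before any libyuv call initializes the CPU flags.
+      setenv("LIBYUV_DISABLE_RVV", "1", /*overwrite=*/1);
+      // MaskCpuFlags is the programmatic equivalent: 0 disables every SIMD
+      // path and forces the portable C code.
+      libyuv::MaskCpuFlags(0);
+      return 0;
+    }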
# Test Width/Height/Repeat
diff --git a/files/docs/filtering.md b/docs/filtering.md
index 8696976e..8696976e 100644
--- a/files/docs/filtering.md
+++ b/docs/filtering.md
diff --git a/files/docs/formats.md b/docs/formats.md
index 97e8ce05..12ea9465 100644
--- a/files/docs/formats.md
+++ b/docs/formats.md
@@ -4,7 +4,9 @@ Formats (FOURCC) supported by libyuv are detailed here.
# Core Formats
-There are 2 core formats supported by libyuv - I420 and ARGB. All YUV formats can be converted to/from I420. All RGB formats can be converted to/from ARGB.
+There are 2 core formats supported by libyuv - I420 and ARGB.
+ All YUV formats can be converted to/from I420.
+ All RGB formats can be converted to/from ARGB.
Filtering functions such as scaling and planar functions work on I420 and/or ARGB.
@@ -36,7 +38,7 @@ This is how OSX formats map to libyuv
The following is extracted from video_common.h as a complete list of formats supported by libyuv.
enum FourCC {
- // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ // 10 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
@@ -46,16 +48,20 @@ The following is extracted from video_common.h as a complete list of formats sup
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
+ FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020, unofficial fourcc.
+ // 10 bit lsb
// 1 Secondary YUV format: row biplanar.
- FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'), // deprecated.
- // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
+ // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc, 2 64 bpp
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010.
FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit
+ FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel.
+ FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit
FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
@@ -66,7 +72,7 @@ The following is extracted from video_common.h as a complete list of formats sup
// 1 Primary Compressed YUV format.
FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
- // 8 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+ // 11 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
@@ -75,6 +81,9 @@ The following is extracted from video_common.h as a complete list of formats sup
FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
FOURCC_H422 = FOURCC('H', '4', '2', '2'), // unofficial fourcc
+ FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc
+ FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc
+ FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -104,6 +113,27 @@ The following is extracted from video_common.h as a complete list of formats sup
I444, NV24 and NV42 are full width, full height
I400 and J400 have no chroma channel.
+# Color space
+ The YUV formats start with a letter to specify the color space. e.g. I420
+ I = BT.601 limited range
+ J = BT.601 full range (J = JPEG that uses this)
+ H = BT.709 limited range (H for HD)
+ F = BT.709 full range (F for Full range)
+ U = BT.2020 limited range (U for UHD)
+ V = BT.2020 full range
+  For YUV to RGB conversions, a matrix can be passed. See also convert_argb.h
+
+# HDR formats
+ Planar formats with 10 or 12 bits use the following fourcc:
+ I010, I012, P010, P012 are half width, half height
+ I210, I212, P210, P212 are half width, full height
+ I410, I412, P410, P412 are full width, full height
+ where
+ I is the color space (see above) and 3 planes: Y, U and V.
+ P is a biplanar format, similar to NV12 but 16 bits, with the valid bits in the high bits. There is a Y plane and a UV plane.
+ 0, 2 or 4 is the last digit of subsampling: 4:2:0, 4:2:2, or 4:4:4
+ 10 or 12 is the bits per channel. The bits are in the low bits of a 16 bit channel.
+
# The ARGB FOURCC
There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA. ARGB is most common by far, used for screen formats, and windows webcam drivers.
@@ -152,6 +182,13 @@ The 2 bit alpha has 4 values. Here are the comparable 8 bit alpha values.
The 10 bit RGB values range from 0 to 1023.
XR30 is the same as AR30 but with no alpha channel.
+# AB64 and AR64
+
+AB64 is similar to ABGR, with 16 bit (2 bytes) per channel. Each channel stores an unsigned short.
+In memory R is the lowest and A is the highest.
+Each channel has values ranging from 0 to 65535.
+AR64 is similar to ARGB.
+
# NV12 and NV21
NV12 is a biplanar format with a full sized Y plane followed by a single
@@ -161,3 +198,11 @@ The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half
height chroma channel, and therefore is a 420 subsampling.
NV16 is 16 bits per pixel, with half width and full height. aka 422.
NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
+Most NV12 functions allow the destination Y pointer to be NULL.
+
+# YUY2 and UYVY
+
+YUY2 is a packed YUV format with half width, full height.
+
+YUY2 is YUYV in memory
+UYVY is UYVY in memory
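+
+# Example
+
+Putting the naming rules above together - a minimal sketch, not part of this
+change, assuming tightly packed planes (stride == width) and the public
+convert.h / convert_argb.h APIs; the helper names are illustrative only:
+
+    #include "libyuv/convert.h"       // I010ToI420
+    #include "libyuv/convert_argb.h"  // I420ToARGBMatrix, kYuvH709Constants
+
+    // Convert a tightly packed I010 frame (10 bit lsb, BT.601, 4:2:0) down
+    // to 8 bit I420. Strides of 16 bit planes are counted in uint16_t units.
+    void I010FrameToI420(const uint16_t* src_y, const uint16_t* src_u,
+                         const uint16_t* src_v, uint8_t* dst_y,
+                         uint8_t* dst_u, uint8_t* dst_v,
+                         int width, int height) {
+      int half = (width + 1) / 2;  // 4:2:0 chroma is half width.
+      libyuv::I010ToI420(src_y, width, src_u, half, src_v, half,
+                         dst_y, width, dst_u, half, dst_v, half,
+                         width, height);
+    }
+
+    // The color space letter selects a default matrix; the *Matrix variants
+    // take it explicitly, e.g. BT.709 limited range ("H"):
+    void H420FrameToARGB(const uint8_t* src_y, const uint8_t* src_u,
+                         const uint8_t* src_v, uint8_t* dst_argb,
+                         int width, int height) {
+      int half = (width + 1) / 2;
+      libyuv::I420ToARGBMatrix(src_y, width, src_u, half, src_v, half,
+                               dst_argb, width * 4,
+                               &libyuv::kYuvH709Constants, width, height);
+    }
+
+The fourcc-specific entry points are thin wrappers over these Matrix
+variants, so passing the constants explicitly gives the same result.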
diff --git a/files/docs/getting_started.md b/docs/getting_started.md
index 4426b606..f2f71b8b 100644
--- a/files/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -139,11 +139,11 @@ mips
arm disassembly:
- third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt
+ llvm-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt
- third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt
+ llvm-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt
- third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
+ llvm-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
Caveat: Disassembly may require optimize_max be disabled in BUILD.gn
@@ -165,6 +165,7 @@ Running test with C code:
ninja -C out/Debug libyuv_unittest
ninja -C out/Debug compare
ninja -C out/Debug yuvconvert
+ ninja -C out/Debug yuvconstants
ninja -C out/Debug psnr
ninja -C out/Debug cpuid
@@ -179,8 +180,8 @@ Running test with C code:
mips
- gn gen out/Release "--args=is_debug=false target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false use_sysroot=false use_gold=false"
- gn gen out/Debug "--args=is_debug=true target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false use_sysroot=false use_gold=false"
+ gn gen out/Release "--args=is_debug=false target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" is_component_build=false use_sysroot=false use_gold=false"
+ gn gen out/Debug "--args=is_debug=true target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" is_component_build=false use_sysroot=false use_gold=false"
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
@@ -190,7 +191,7 @@ mips
make V=1 -f linux.mk
make V=1 -f linux.mk clean
- make V=1 -f linux.mk CXX=clang++
+ make V=1 -f linux.mk CXX=clang++ CC=clang
## Building the library with cmake
@@ -219,6 +220,47 @@ Install cmake: http://www.cmake.org/
make -j4
make package
+## Building RISC-V target with cmake
+
+### Prerequisite: build risc-v clang toolchain and qemu
+
+If you don't have a prebuilt clang and riscv64 qemu, run the script below to download the sources and build them.
+
+ ./riscv_script/prepare_toolchain_qemu.sh
+
+After running the script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`.
+
+### Cross-compile for RISC-V target
+ cmake -B out/Release/ -DUNIT_TEST=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE="./riscv_script/riscv-clang.cmake" \
+ -DTOOLCHAIN_PATH={TOOLCHAIN_PATH} \
+ -DUSE_RVV=ON .
+ cmake --build out/Release/
+
+#### Customized Compiler Flags
+
+Customized compiler flags are supported via `-DRISCV_COMPILER_FLAGS="xxx"`.
+If `-DRISCV_COMPILER_FLAGS="xxx"` is assigned manually, the default compile flags (e.g. `-march=xxx`) will not be appended.
+
+Example:
+
+ cmake -B out/Release/ -DUNIT_TEST=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE="./riscv_script/riscv-clang.cmake" \
+ -DRISCV_COMPILER_FLAGS="-mcpu=sifive-x280" \
+ .
+
+### Run on QEMU
+
+#### Run libyuv_unittest on QEMU
+ cd out/Release/
+ USE_RVV=ON \
+ TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
+ QEMU_PREFIX_PATH={QEMU_PREFIX_PATH} \
+ ../../riscv_script/run_qemu.sh libyuv_unittest
+
## Setup for Arm Cross compile
See also https://www.ccoderun.ca/programming/2015-12-20_CrossCompiling/index.html
diff --git a/files/docs/rotation.md b/docs/rotation.md
index fb84fce5..a08430fd 100644
--- a/files/docs/rotation.md
+++ b/docs/rotation.md
@@ -100,4 +100,8 @@ Inverting can be achieved with almost any libyuv function by passing a negative
I420Mirror and ARGBMirror can also be used to rotate by 180 degrees by passing a negative height.
+# Cropping - Vertical Flip
+When cropping from a subsampled format like NV21, setting the start pointers won't work for an odd crop start y on the UV plane.
+If the height after cropping will be odd, invert the source instead - point to the last row, negate the strides, and pass a
+negative height, which will re-invert the image as the conversion outputs, as sketched below.
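+
+A minimal sketch, not part of this change; the helper name is illustrative,
+planes are assumed tightly packed with even width, and the public NV21ToI420
+from convert.h is used:
+
+    #include "libyuv/convert.h"
+
+    // Crop the bottom crop_height rows of an NV21 frame (where the crop
+    // start y is odd) and convert to upright I420.
+    void CropBottomNV21ToI420(const uint8_t* nv21, int width, int height,
+                              int crop_height, uint8_t* dst_y,
+                              uint8_t* dst_u, uint8_t* dst_v) {
+      const uint8_t* vu = nv21 + width * height;  // interleaved VU plane.
+      // Point both planes at their last row; negated strides walk upward.
+      const uint8_t* last_y = nv21 + (height - 1) * width;
+      const uint8_t* last_vu = vu + ((height + 1) / 2 - 1) * width;
+      int half = (width + 1) / 2;
+      libyuv::NV21ToI420(last_y, -width, last_vu, -width,
+                         dst_y, width, dst_u, half, dst_v, half,
+                         // Negative height re-inverts; output is upright.
+                         width, -crop_height);
+    }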
diff --git a/files/download_vs_toolchain.py b/download_vs_toolchain.py
index 4b345789..6bc086d6 100644
--- a/files/download_vs_toolchain.py
+++ b/download_vs_toolchain.py
@@ -1,5 +1,5 @@
-#!/usr/bin/env python
-#
+#!/usr/bin/env vpython3
+
# Copyright 2014 The LibYuv Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style license
@@ -22,7 +22,7 @@ sys.path.insert(0, os.path.join(checkout_root, 'build'))
sys.path.insert(0, os.path.join(checkout_root, 'tools', 'find_depot_tools'))
-import vs_toolchain
+import vs_toolchain # pylint: disable=wrong-import-position
if __name__ == '__main__':
diff --git a/files/.gitignore b/files/.gitignore
deleted file mode 100644
index 711f09e0..00000000
--- a/files/.gitignore
+++ /dev/null
@@ -1,94 +0,0 @@
-*.pyc
-pin-log.txt
-/base
-/build
-/buildtools
-/chromium/.gclient.tmp
-/chromium/.gclient.tmp_entries
-/chromium/.last_sync_chromium
-/chromium/src/
-/google_apis
-/links
-/links.db
-/mojo
-/native_client
-/net
-/out
-/sde-avx-sse-transition-out.txt
-/testing
-/third_party/android_platform
-/third_party/android_tools
-/third_party/appurify-python
-/third_party/asan
-/third_party/ashmem
-/third_party/binutils
-/third_party/BUILD.gn
-/third_party/catapult
-/third_party/drmemory
-/third_party/gflags/src
-/third_party/icu
-/third_party/ijar
-/third_party/instrumented_libraries
-/third_party/jsr-305
-/third_party/junit
-/third_party/libjpeg
-/third_party/libjpeg_turbo
-/third_party/libxml
-/third_party/llvm
-/third_party/llvm-build
-/third_party/lss
-/third_party/mockito
-/third_party/modp_b64
-/third_party/protobuf
-/third_party/requests
-/third_party/robolectric
-/third_party/WebKit
-/third_party/yasm
-/tools/android
-/tools/clang
-/tools/generate_library_loader
-/tools/gn
-/tools/grit
-/tools/gritsettings/README
-/tools/gritsettings/resource_ids
-/tools/gyp
-/tools/isolate_driver.py
-/tools/memory
-/tools/protoc_wrapper
-/tools/python
-/tools/sanitizer_options
-/tools/swarming_client
-/tools/tsan_suppressions
-/tools/valgrind
-/tools/valgrind-libyuv/libyuv_tests.bat
-/tools/valgrind-libyuv/libyuv_tests.py
-/tools/valgrind-libyuv/libyuv_tests.sh
-/tools/valgrind-libyuv/memcheck/OWNERS
-/tools/valgrind-libyuv/memcheck/PRESUBMIT.py
-/tools/valgrind-libyuv/memcheck/suppressions.txt
-/tools/valgrind-libyuv/memcheck/suppressions_mac.txt
-/tools/valgrind-libyuv/memcheck/suppressions_win32.txt
-/tools/valgrind-libyuv/tsan/OWNERS
-/tools/valgrind-libyuv/tsan/PRESUBMIT.py
-/tools/valgrind-libyuv/tsan/suppressions.txt
-/tools/valgrind-libyuv/tsan/suppressions_mac.txt
-/tools/valgrind-libyuv/tsan/suppressions_win32.txt
-/tools/vim
-/tools/win
-
-# Files generated by CMake build
-cmake_install.cmake
-CMakeCache.txt
-CMakeFiles/
-convert
-libgtest.a
-libyuv.a
-libyuv_unittest
-
-# Files generated by winarm.mk build
-libyuv_arm.lib
-source/*.o
-
-# Files generated by perf
-perf.data
-perf.data.old
diff --git a/files/Android.bp b/files/Android.bp
deleted file mode 100644
index 20b8c234..00000000
--- a/files/Android.bp
+++ /dev/null
@@ -1,179 +0,0 @@
-package {
- default_applicable_licenses: ["external_libyuv_files_license"],
-}
-
-// Added automatically by a large-scale-change
-//
-// large-scale-change included anything that looked like it might be a license
-// text as a license_text. e.g. LICENSE, NOTICE, COPYING etc.
-//
-// Please consider removing redundant or irrelevant files from 'license_text:'.
-// See: http://go/android-license-faq
-license {
- name: "external_libyuv_files_license",
- visibility: [":__subpackages__"],
- license_kinds: [
- "SPDX-license-identifier-BSD",
- ],
- license_text: [
- "LICENSE",
- "LICENSE_THIRD_PARTY",
- "PATENTS",
- ],
-}
-
-cc_library {
- name: "libyuv",
- vendor_available: true,
- product_available: true,
- host_supported: true,
- vndk: {
- enabled: true,
- },
-
- srcs: [
- "source/compare.cc",
- "source/compare_common.cc",
- "source/compare_gcc.cc",
- "source/compare_neon.cc",
- "source/compare_neon64.cc",
- "source/compare_mmi.cc",
- "source/compare_msa.cc",
- "source/convert.cc",
- "source/convert_argb.cc",
- "source/convert_from.cc",
- "source/convert_from_argb.cc",
- "source/convert_to_argb.cc",
- "source/convert_to_i420.cc",
- "source/cpu_id.cc",
- "source/planar_functions.cc",
- "source/rotate.cc",
- "source/rotate_any.cc",
- "source/rotate_argb.cc",
- "source/rotate_common.cc",
- "source/rotate_gcc.cc",
- "source/rotate_mmi.cc",
- "source/rotate_msa.cc",
- "source/rotate_neon.cc",
- "source/rotate_neon64.cc",
- "source/row_any.cc",
- "source/row_common.cc",
- "source/row_gcc.cc",
- "source/row_mmi.cc",
- "source/row_msa.cc",
- "source/row_neon.cc",
- "source/row_neon64.cc",
- "source/scale.cc",
- "source/scale_any.cc",
- "source/scale_argb.cc",
- "source/scale_common.cc",
- "source/scale_gcc.cc",
- "source/scale_mmi.cc",
- "source/scale_msa.cc",
- "source/scale_neon.cc",
- "source/scale_neon64.cc",
- "source/video_common.cc",
- "source/convert_jpeg.cc",
- "source/mjpeg_decoder.cc",
- "source/mjpeg_validate.cc",
- ],
-
- cflags: [
- "-Wall",
- "-Werror",
- "-Wno-unused-parameter",
- "-fexceptions",
- "-DHAVE_JPEG",
- ],
-
- arch: {
- arm: {
- cflags: ["-mfpu=neon"],
- },
- },
-
- shared_libs: ["libjpeg"],
-
- export_include_dirs: ["include"],
-
- apex_available: [
- "//apex_available:platform",
- "com.android.media.swcodec",
- ],
- min_sdk_version: "29",
-}
-
-// compatibilty static library until all uses of libyuv_static are replaced
-// with libyuv (b/37646797)
-cc_library_static {
- name: "libyuv_static",
- vendor_available: true,
- whole_static_libs: ["libyuv"],
- apex_available: [
- "//apex_available:platform",
- "com.android.media.swcodec",
- ],
- min_sdk_version: "29",
-}
-
-cc_test {
- name: "libyuv_unittest",
- static_libs: ["libyuv"],
- shared_libs: ["libjpeg"],
- cflags: ["-Wall", "-Werror"],
- srcs: [
- "unit_test/unit_test.cc",
- "unit_test/basictypes_test.cc",
- "unit_test/color_test.cc",
- "unit_test/compare_test.cc",
- "unit_test/convert_test.cc",
- "unit_test/cpu_test.cc",
- "unit_test/cpu_thread_test.cc",
- "unit_test/math_test.cc",
- "unit_test/planar_test.cc",
- "unit_test/rotate_argb_test.cc",
- "unit_test/rotate_test.cc",
- "unit_test/scale_argb_test.cc",
- "unit_test/scale_test.cc",
- "unit_test/video_common_test.cc",
- ],
-}
-
-cc_test {
- name: "compare",
- gtest: false,
- srcs: [
- "util/compare.cc",
- ],
- static_libs: ["libyuv"],
-}
-
-cc_test {
- name: "cpuid",
- gtest: false,
- srcs: [
- "util/cpuid.c",
- ],
- static_libs: ["libyuv"],
-}
-
-cc_test {
- name: "psnr",
- gtest: false,
- srcs: [
- "util/psnr_main.cc",
- "util/psnr.cc",
- "util/ssim.cc",
- ],
- static_libs: ["libyuv"],
-}
-
-cc_test {
- name: "yuvconvert",
- gtest: false,
- srcs: [
- "util/yuvconvert.cc",
- ],
- static_libs: ["libyuv"],
- shared_libs: ["libjpeg"],
-}
diff --git a/files/DEPS b/files/DEPS
deleted file mode 100644
index c5f81b86..00000000
--- a/files/DEPS
+++ /dev/null
@@ -1,1096 +0,0 @@
-vars = {
- 'chromium_git': 'https://chromium.googlesource.com',
- 'chromium_revision': '4476bd69d1c8e4e1cde8633d3b33c992f7d3a6d0',
- 'swarming_revision': '0e3e1c4dc4e79f25a5b58fcbc135dc93183c0c54',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling lss
- # and whatever else without interference from each other.
- 'lss_revision': 'e6527b0cd469e3ff5764785dadcb39bf7d787154',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling catapult
- # and whatever else without interference from each other.
- 'catapult_revision': 'a24a725f7834c16b3628bfb63f349b3480bf9592',
- # the commit queue can handle CLs rolling android_sdk_build-tools_version
- # and whatever else without interference from each other.
- 'android_sdk_build-tools_version': 'DLK621q5_Bga5EsOr7cp6bHWWxFKx6UHLu_Ix_m3AckC',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_emulator_version
- # and whatever else without interference from each other.
- 'android_sdk_emulator_version': 'ki7EDQRAiZAUYlnTWR1XmI6cJTk65fJ-DNZUU1zrtS8C',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_extras_version
- # and whatever else without interference from each other.
- 'android_sdk_extras_version': 'iIwhhDox5E-mHgwUhCz8JACWQCpUjdqt5KTY9VLugKQC',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_patcher_version
- # and whatever else without interference from each other.
- 'android_sdk_patcher_version': 'I6FNMhrXlpB-E1lOhMlvld7xt9lBVNOO83KIluXDyA0C',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_platform-tools_version
- # and whatever else without interference from each other.
- 'android_sdk_platform-tools_version': '4Y2Cb2LGzoc-qt-oIUIlhySotJaKeE3ELFedSVe6Uk8C',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_platforms_version
- # and whatever else without interference from each other.
- 'android_sdk_platforms_version': 'Kg2t9p0YnQk8bldUv4VA3o156uPXLUfIFAmVZ-Gm5ewC',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_sources_version
- # and whatever else without interference from each other.
- 'android_sdk_sources_version': 'K9uEn3JvNELEVjjVK_GQD3ZQD3rqAnJSxCWxjmUmRkgC',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_tools_version
- # and whatever else without interference from each other.
- 'android_sdk_tools_version': 'wYcRQC2WHsw2dKWs4EA7fw9Qsyzu1ds1_fRjKmGxe5QC',
- # Three lines of non-changing comments so that
- # the commit queue can handle CLs rolling android_sdk_tools-lint_version
- # and whatever else without interference from each other.
- 'android_sdk_tools-lint_version': '89hXqZYzCum3delB5RV7J_QyWkaRodqdtQS0s3LMh3wC',
-}
-
-deps = {
- 'src/build':
- Var('chromium_git') + '/chromium/src/build' + '@' + '669e41d6f18842ed5740449662a71b715dc607c6',
- 'src/buildtools':
- Var('chromium_git') + '/chromium/buildtools.git' + '@' + '0e1cbc4eab6861b0c84bf2ed9a3c4b7aa2063819',
- 'src/testing':
- Var('chromium_git') + '/chromium/src/testing' + '@' + 'b1c6aeebeabcc177a83ff0a33dc6c3ab03d4aa94',
- 'src/third_party':
- Var('chromium_git') + '/chromium/src/third_party' + '@' + 'be3e0fc18f2e9ea14d0e9369e539eae5986335fd',
- 'src/third_party/catapult':
- Var('chromium_git') + '/catapult.git' + '@' + Var('catapult_revision'),
- 'src/third_party/colorama/src':
- Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8',
- 'src/third_party/freetype/src':
- Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + 'd01e28f41f8810c8ea422b854f8722659589fa99',
- 'src/third_party/googletest/src':
- Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '879ac092fde0a19e1b3a61b2546b2a422b1528bc',
- 'src/third_party/harfbuzz-ng/src':
- Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '26c5b54fb09fb45e02c9c4618bcea4958c698953',
- 'src/third_party/libjpeg_turbo':
- Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '61a2bbaa9aec89cb2c882d87ace6aba9aee49bb9',
- 'src/third_party/yasm/source/patched-yasm':
- Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '720b70524a4424b15fc57e82263568c8ba0496ad',
- 'src/tools':
- Var('chromium_git') + '/chromium/src/tools' + '@' + '419541c8352b3b75a99c9a5a7c0d1e7b92f3fcf7',
- 'src/tools/swarming_client':
- Var('chromium_git') + '/infra/luci/client-py.git' + '@' + Var('swarming_revision'),
-
- # libyuv-only dependencies (not present in Chromium).
- 'src/third_party/gflags':
- Var('chromium_git') + '/external/webrtc/deps/third_party/gflags' + '@' + '892576179b45861b53e04a112996a738309cf364',
- 'src/third_party/gflags/src':
- Var('chromium_git') + '/external/github.com/gflags/gflags' + '@' + '03bebcb065c83beff83d50ae025a55a4bf94dfca',
- 'src/third_party/gtest-parallel':
- Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e',
-
- 'src/third_party/lss': {
- 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'),
- 'condition': 'checkout_android or checkout_linux',
- },
-
- # Android deps:
- 'src/third_party/accessibility_test_framework': {
- 'packages': [
- {
- 'package': 'chromium/third_party/accessibility-test-framework',
- 'version': 'version:2.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/auto/src': {
- 'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + '8a81a858ae7b78a1aef71ac3905fade0bbd64e82',
- 'condition': 'checkout_android',
- },
- 'src/base': {
- 'url': Var('chromium_git') + '/chromium/src/base' + '@' + '162a5d66ad148f26bbbe6b6ecaf5c1bafa2173e6',
- 'condition': 'checkout_android',
- },
- 'src/third_party/bazel': {
- 'packages': [
- {
- 'package': 'chromium/third_party/bazel',
- 'version': 'version:0.10.0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/bouncycastle': {
- 'packages': [
- {
- 'package': 'chromium/third_party/bouncycastle',
- 'version': 'version:1.46-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/android_ndk': {
- 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '4e2cea441bfd43f0863d14f57b1e1844260b9884',
- 'condition': 'checkout_android',
- },
- 'src/third_party/android_support_test_runner': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_support_test_runner',
- 'version': 'version:0.5-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/android_tools': {
- 'url': Var('chromium_git') + '/android_tools.git' + '@' + 'e958d6ea74442d4e0849bb8a018d215a0e78981d',
- 'condition': 'checkout_android',
- },
- 'src/third_party/android_sdk/public': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_sdk/public/build-tools',
- 'version': Var('android_sdk_build-tools_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/emulator',
- 'version': Var('android_sdk_emulator_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/extras',
- 'version': Var('android_sdk_extras_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/patcher',
- 'version': Var('android_sdk_patcher_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/platform-tools',
- 'version': Var('android_sdk_platform-tools_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/platforms',
- 'version': Var('android_sdk_platforms_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/sources',
- 'version': Var('android_sdk_sources_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/tools',
- 'version': Var('android_sdk_tools_version'),
- },
- {
- 'package': 'chromium/third_party/android_sdk/public/tools-lint',
- 'version': Var('android_sdk_tools-lint_version'),
- },
- ],
- 'condition': 'checkout_android_native_support',
- 'dep_type': 'cipd',
- },
- 'src/third_party/android_build_tools/aapt2': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_tools_aapt2',
- 'version': 'version:3.2.0-alpha18-4804415-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/byte_buddy': {
- 'packages': [
- {
- 'package': 'chromium/third_party/byte_buddy',
- 'version': 'version:1.4.17-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/ced/src': {
- 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + '94c367a1fe3a13207f4b22604fcfd1d9f9ddf6d9',
- 'condition': 'checkout_android',
- },
- 'src/third_party/errorprone/lib': {
- 'url': Var('chromium_git') + '/chromium/third_party/errorprone.git' + '@' + '980d49e839aa4984015efed34b0134d4b2c9b6d7',
- 'condition': 'checkout_android',
- },
- 'src/third_party/findbugs': {
- 'url': Var('chromium_git') + '/chromium/deps/findbugs.git' + '@' + '4275d9ac8610db6b1bc9a5e887f97e41b33fac67',
- 'condition': 'checkout_android',
- },
- 'src/third_party/gson': {
- 'packages': [
- {
- 'package': 'chromium/third_party/gson',
- 'version': 'version:2.8.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/guava': {
- 'packages': [
- {
- 'package': 'chromium/third_party/guava',
- 'version': 'version:23.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/hamcrest': {
- 'packages': [
- {
- 'package': 'chromium/third_party/hamcrest',
- 'version': 'version:1.3-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/icu': {
- 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'd65301491c513d49163ad29c853eb85c02c8d5b4',
- },
- 'src/third_party/icu4j': {
- 'packages': [
- {
- 'package': 'chromium/third_party/icu4j',
- 'version': 'version:53.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/intellij': {
- 'packages': [
- {
- 'package': 'chromium/third_party/intellij',
- 'version': 'version:12.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/jsr-305/src': {
- 'url': Var('chromium_git') + '/external/jsr-305.git' + '@' + '642c508235471f7220af6d5df2d3210e3bfc0919',
- 'condition': 'checkout_android',
- },
- 'src/third_party/junit/src': {
- 'url': Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481',
- 'condition': 'checkout_android',
- },
- 'src/third_party/mockito/src': {
- 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '04a2a289a4222f80ad20717c25144981210d2eac',
- 'condition': 'checkout_android',
- },
- 'src/third_party/objenesis': {
- 'packages': [
- {
- 'package': 'chromium/third_party/objenesis',
- 'version': 'version:2.4-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/ow2_asm': {
- 'packages': [
- {
- 'package': 'chromium/third_party/ow2_asm',
- 'version': 'version:5.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/r8': {
- 'packages': [
- {
- 'package': 'chromium/third_party/r8',
- 'version': 'version:1.0.30',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/proguard': {
- 'packages': [
- {
- 'package': 'chromium/third_party/proguard',
- 'version': '3bd778c422ea5496de2ef25c007a517dbb5ce5ca',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/requests/src': {
- 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'f172b30356d821d180fa4ecfa3e71c7274a32de4',
- 'condition': 'checkout_android',
- },
- 'src/third_party/robolectric': {
- 'packages': [
- {
- 'package': 'chromium/third_party/robolectric',
- 'version': 'version:3.5.1',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/robolectric/robolectric': {
- 'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '7e067f1112e1502caa742f7be72d37b5678d3403',
- 'condition': 'checkout_android',
- },
- 'src/third_party/sqlite4java': {
- 'packages': [
- {
- 'package': 'chromium/third_party/sqlite4java',
- 'version': 'version:0.282-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
- 'src/third_party/ub-uiautomator/lib': {
- 'url': Var('chromium_git') + '/chromium/third_party/ub-uiautomator.git' + '@' + '00270549ce3161ae72ceb24712618ea28b4f9434',
- 'condition': 'checkout_android',
- },
- 'src/third_party/xstream': {
- 'packages': [
- {
- 'package': 'chromium/third_party/xstream',
- 'version': 'version:1.4.8-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- # iOS deps:
- 'src/ios': {
- 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '44be3c093cf2db7ab4cf1997d6a1a07722f1f391',
- 'condition': 'checkout_ios'
- },
-
- # Win deps:
- # Dependencies used by libjpeg-turbo
- 'src/third_party/yasm/binaries': {
- 'url': Var('chromium_git') + '/chromium/deps/yasm/binaries.git' + '@' + '52f9b3f4b0aa06da24ef8b123058bb61ee468881',
- 'condition': 'checkout_win',
- },
-
- # === ANDROID_DEPS Generated Code Start ===
- # Generated by //tools/android/roll/android_deps/fetch_all.sh
- 'src/third_party/android_deps/libs/android_arch_core_common': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/android_arch_core_common',
- 'version': 'version:1.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/android_arch_lifecycle_common': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common',
- 'version': 'version:1.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime',
- 'version': 'version:1.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_animated_vector_drawable',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_appcompat_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_appcompat_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_cardview_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_design': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_design',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_gridlayout_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_gridlayout_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_leanback_v17': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_leanback_v17',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_mediarouter_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_mediarouter_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_multidex': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_multidex',
- 'version': 'version:1.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_palette_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_palette_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_preference_leanback_v17': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_leanback_v17',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_preference_v14': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_v14',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_preference_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_recyclerview_v7': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_recyclerview_v7',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_annotations': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_annotations',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_compat': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_compat',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_core_ui': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_ui',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_core_utils': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_utils',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_fragment': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_fragment',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_media_compat': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_media_compat',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_v13': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v13',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_v4': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v4',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_support_vector_drawable': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_vector_drawable',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_android_support_transition': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_android_support_transition',
- 'version': 'version:27.0.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_base': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_basement': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_fido': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_fido',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_gcm': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_iid': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_location': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_tasks': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common',
- 'version': 'version:12.0.1-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_google_android_play_core': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core',
- 'version': 'version:1.3.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- 'src/third_party/android_deps/libs/com_squareup_javapoet': {
- 'packages': [
- {
- 'package': 'chromium/third_party/android_deps/libs/com_squareup_javapoet',
- 'version': 'version:1.11.0-cr0',
- },
- ],
- 'condition': 'checkout_android',
- 'dep_type': 'cipd',
- },
-
- # === ANDROID_DEPS Generated Code End ===
-}
-
-# Define rules for which include paths are allowed in our source.
-include_rules = [ '+gflags' ]
-
-pre_deps_hooks = [
- {
- # Remove any symlinks from before 177567c518b121731e507e9b9c4049c4dc96e4c8.
- # TODO(kjellander): Remove this in March 2017.
- 'name': 'cleanup_links',
- 'pattern': '.',
- 'action': ['python', 'src/cleanup_links.py'],
- },
-]
-
-hooks = [
- {
-    # This clobbers when necessary (based on get_landmines.py). It should be
-    # an early hook, but it needs to run after syncing Chromium and setting
-    # up the links so that the script actually exists.
- 'name': 'landmines',
- 'pattern': '.',
- 'action': [
- 'python',
- 'src/build/landmines.py',
- '--landmine-scripts',
- 'src/tools_libyuv/get_landmines.py',
- '--src-dir',
- 'src',
- ],
- },
- # Downloads the current stable linux sysroot to build/linux/ if needed.
- {
- 'name': 'sysroot_arm',
- 'pattern': '.',
- 'condition': 'checkout_linux and checkout_arm',
- 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
- '--arch=arm'],
- },
- {
- 'name': 'sysroot_arm64',
- 'pattern': '.',
- 'condition': 'checkout_linux and checkout_arm64',
- 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
- '--arch=arm64'],
- },
- {
- 'name': 'sysroot_x86',
- 'pattern': '.',
- 'condition': 'checkout_linux and (checkout_x86 or checkout_x64)',
- 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
- '--arch=x86'],
- },
- {
- 'name': 'sysroot_mips',
- 'pattern': '.',
- 'condition': 'checkout_linux and checkout_mips',
- 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
- '--arch=mips'],
- },
- {
- 'name': 'sysroot_x64',
- 'pattern': '.',
- 'condition': 'checkout_linux and checkout_x64',
- 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
- '--arch=x64'],
- },
- {
- # Update the Windows toolchain if necessary.
- 'name': 'win_toolchain',
- 'pattern': '.',
- 'action': ['python', 'src/build/vs_toolchain.py', 'update'],
- },
- {
- # Update the Mac toolchain if necessary.
- 'name': 'mac_toolchain',
- 'pattern': '.',
- 'action': ['python', 'src/build/mac_toolchain.py'],
- },
-  # Pull binutils for Linux, which enables debug fission for faster linking /
-  # debugging when used with clang on Ubuntu Precise.
- # https://code.google.com/p/chromium/issues/detail?id=352046
- {
- 'name': 'binutils',
- 'pattern': 'src/third_party/binutils',
- 'action': [
- 'python',
- 'src/third_party/binutils/download.py',
- ],
- },
- {
- # Pull clang if needed or requested via GYP_DEFINES.
- # Note: On Win, this should run after win_toolchain, as it may use it.
- 'name': 'clang',
- 'pattern': '.',
- 'action': ['python', 'src/tools/clang/scripts/update.py'],
- },
- {
- # Update LASTCHANGE.
- 'name': 'lastchange',
- 'pattern': '.',
- 'action': ['python', 'src/build/util/lastchange.py',
- '-o', 'src/build/util/LASTCHANGE'],
- },
- # Pull GN binaries.
- {
- 'name': 'gn_win',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=win32',
- '--no_auth',
- '--bucket', 'chromium-gn',
- '-s', 'src/buildtools/win/gn.exe.sha1',
- ],
- },
- {
- 'name': 'gn_mac',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=darwin',
- '--no_auth',
- '--bucket', 'chromium-gn',
- '-s', 'src/buildtools/mac/gn.sha1',
- ],
- },
- {
- 'name': 'gn_linux64',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=linux*',
- '--no_auth',
- '--bucket', 'chromium-gn',
- '-s', 'src/buildtools/linux64/gn.sha1',
- ],
- },
- # Pull clang-format binaries using checked-in hashes.
- {
- 'name': 'clang_format_win',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=win32',
- '--no_auth',
- '--bucket', 'chromium-clang-format',
- '-s', 'src/buildtools/win/clang-format.exe.sha1',
- ],
- },
- {
- 'name': 'clang_format_mac',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=darwin',
- '--no_auth',
- '--bucket', 'chromium-clang-format',
- '-s', 'src/buildtools/mac/clang-format.sha1',
- ],
- },
- {
- 'name': 'clang_format_linux',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=linux*',
- '--no_auth',
- '--bucket', 'chromium-clang-format',
- '-s', 'src/buildtools/linux64/clang-format.sha1',
- ],
- },
- # Pull luci-go binaries (isolate, swarming) using checked-in hashes.
- {
- 'name': 'luci-go_win',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=win32',
- '--no_auth',
- '--bucket', 'chromium-luci',
- '-d', 'src/tools/luci-go/win64',
- ],
- },
- {
- 'name': 'luci-go_mac',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=darwin',
- '--no_auth',
- '--bucket', 'chromium-luci',
- '-d', 'src/tools/luci-go/mac64',
- ],
- },
- {
- 'name': 'luci-go_linux',
- 'pattern': '.',
- 'action': [ 'download_from_google_storage',
- '--no_resume',
- '--platform=linux*',
- '--no_auth',
- '--bucket', 'chromium-luci',
- '-d', 'src/tools/luci-go/linux64',
- ],
- },
- {
- # We used to use src as a CIPD root. We moved it to a different directory
- # in crrev.com/c/930178 but left the clobber here to ensure that that CL
- # could be reverted safely. This can be safely removed once crbug.com/794764
- # is resolved.
- 'name': 'Android Clobber Deprecated CIPD Root',
- 'pattern': '.',
- 'condition': 'checkout_android',
- 'action': ['src/build/cipd/clobber_cipd_root.py',
- '--root', 'src',
- ],
- },
-  # Android dependencies. Many are downloaded from Google Storage these days.
-  # The entries are copied from https://cs.chromium.org/chromium/src/DEPS for
-  # all such dependencies we share with Chromium.
- {
- # This downloads SDK extras and puts them in the
- # third_party/android_sdk/public/extras directory.
- 'name': 'sdkextras',
- 'condition': 'checkout_android',
- 'pattern': '.',
- 'action': ['vpython',
- 'src/build/android/play_services/update.py',
- 'download'
- ],
- },
-]
-
-recursedeps = [
- # buildtools provides clang_format, libc++, and libc++abi.
- 'src/buildtools',
-]
diff --git a/files/LICENSE b/files/LICENSE
deleted file mode 100644
index c911747a..00000000
--- a/files/LICENSE
+++ /dev/null
@@ -1,29 +0,0 @@
-Copyright 2011 The LibYuv Project Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
-
- * Neither the name of Google nor the names of its contributors may
- be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/files/LICENSE_THIRD_PARTY b/files/LICENSE_THIRD_PARTY
deleted file mode 100644
index a71591e7..00000000
--- a/files/LICENSE_THIRD_PARTY
+++ /dev/null
@@ -1,8 +0,0 @@
-This source tree contains third party source code which is governed by third
-party licenses. This file contains references to files which are under other
-licenses than the one provided in the LICENSE file in the root of the source
-tree.
-
-Files governed by third party licenses:
-source/x86inc.asm
-
diff --git a/files/all.gyp b/files/all.gyp
deleted file mode 100644
index 88a74842..00000000
--- a/files/all.gyp
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright 2013 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# all.gyp and the All target are for the benefit of the Android GYP build.
-{
- 'targets': [
- {
- 'target_name': 'All',
- 'type': 'none',
- 'dependencies': [
- 'libyuv.gyp:*',
- 'libyuv_test.gyp:*',
- ],
- },
- ],
-}
diff --git a/files/chromium/.gclient b/files/chromium/.gclient
deleted file mode 100644
index c1a86ecf..00000000
--- a/files/chromium/.gclient
+++ /dev/null
@@ -1,20 +0,0 @@
-solutions = [{
- 'name': 'src',
- 'url': 'https://chromium.googlesource.com/chromium/src.git',
- 'deps_file': '.DEPS.git',
- 'managed': False,
- 'custom_deps': {
- # Skip syncing some large dependencies Libyuv will never need.
- 'src/third_party/cld_2/src': None,
- 'src/third_party/ffmpeg': None,
- 'src/third_party/hunspell_dictionaries': None,
- 'src/third_party/liblouis/src': None,
- 'src/third_party/pdfium': None,
- 'src/third_party/skia': None,
- 'src/third_party/trace-viewer': None,
- 'src/third_party/webrtc': None,
- },
- 'safesync_url': ''
-}]
-
-cache_dir = None
diff --git a/files/chromium/README b/files/chromium/README
deleted file mode 100644
index 127f4b52..00000000
--- a/files/chromium/README
+++ /dev/null
@@ -1,5 +0,0 @@
-This .gclient file is used to download a copy of Chromium.
-Libyuv uses the Chromium build toolchain and a number of shared
-dependencies by creating symlinks to folders in this checkout,
-using the ../setup_links.py script.
-
diff --git a/files/gyp_libyuv b/files/gyp_libyuv
deleted file mode 100755
index 445b924f..00000000
--- a/files/gyp_libyuv
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# This script is used to run GYP for libyuv. It contains selected parts of the
-# main function from the src/build/gyp_chromium file.
-
-import glob
-import os
-import shlex
-import sys
-
-checkout_root = os.path.dirname(os.path.realpath(__file__))
-
-sys.path.insert(0, os.path.join(checkout_root, 'build'))
-import gyp_chromium
-import gyp_helper
-import vs_toolchain
-
-sys.path.insert(0, os.path.join(checkout_root, 'tools', 'gyp', 'pylib'))
-import gyp
-
-def GetSupplementalFiles():
- """Returns a list of the supplemental files that are included in all GYP
- sources."""
- # Can't use the one in gyp_chromium since the directory location of the root
- # is different.
- return glob.glob(os.path.join(checkout_root, '*', 'supplement.gypi'))
-
-
-if __name__ == '__main__':
- args = sys.argv[1:]
-
- if int(os.environ.get('GYP_CHROMIUM_NO_ACTION', 0)):
- print 'Skipping gyp_libyuv due to GYP_CHROMIUM_NO_ACTION env var.'
- sys.exit(0)
-
- # This could give false positives since it doesn't actually do real option
- # parsing. Oh well.
- gyp_file_specified = False
- for arg in args:
- if arg.endswith('.gyp'):
- gyp_file_specified = True
- break
-
- # If we didn't get a file, assume 'all.gyp' in the root of the checkout.
- if not gyp_file_specified:
- # Because of a bug in gyp, simply adding the abspath to all.gyp doesn't
- # work, but chdir'ing and adding the relative path does. Spooky :/
- os.chdir(checkout_root)
- args.append('all.gyp')
-
-  # There shouldn't be a circular dependency relationship between .gyp files.
- args.append('--no-circular-check')
-
- # Default to ninja unless GYP_GENERATORS is set.
- if not os.environ.get('GYP_GENERATORS'):
- os.environ['GYP_GENERATORS'] = 'ninja'
-
- vs2013_runtime_dll_dirs = None
- if int(os.environ.get('DEPOT_TOOLS_WIN_TOOLCHAIN', '1')):
- vs2013_runtime_dll_dirs = vs_toolchain.SetEnvironmentAndGetRuntimeDllDirs()
-
- # Enforce gyp syntax checking. This adds about 20% execution time.
- args.append('--check')
-
- supplemental_includes = gyp_chromium.GetSupplementalFiles()
- gyp_vars_dict = gyp_chromium.GetGypVars(supplemental_includes)
-
- # Automatically turn on crosscompile support for platforms that need it.
- if all(('ninja' in os.environ.get('GYP_GENERATORS', ''),
- gyp_vars_dict.get('OS') in ['android', 'ios'],
- 'GYP_CROSSCOMPILE' not in os.environ)):
- os.environ['GYP_CROSSCOMPILE'] = '1'
-
- args.extend(['-I' + i for i in
- gyp_chromium.additional_include_files(supplemental_includes,
- args)])
-
- # Set the gyp depth variable to the root of the checkout.
- args.append('--depth=' + os.path.relpath(checkout_root))
-
- print 'Updating projects from gyp files...'
- sys.stdout.flush()
-
- # Off we go...
- gyp_rc = gyp.main(args)
-
- if vs2013_runtime_dll_dirs:
- x64_runtime, x86_runtime = vs2013_runtime_dll_dirs
- vs_toolchain.CopyVsRuntimeDlls(
- os.path.join(checkout_root, gyp_chromium.GetOutputDirectory()),
- (x86_runtime, x64_runtime))
-
- sys.exit(gyp_rc)
diff --git a/files/gyp_libyuv.py b/files/gyp_libyuv.py
deleted file mode 100644
index bb32ec39..00000000
--- a/files/gyp_libyuv.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-
-# This script is a modified copy of the src/build/gyp_chromium.py file.
-# It is needed for parallel processing.
-
-# This file is (possibly, depending on python version) imported by
-# gyp_libyuv when GYP_PARALLEL=1 and it creates sub-processes
-# through the multiprocessing library.
-
-# Importing in Python 2.6 (fixed in 2.7) on Windows doesn't search for
-# imports that don't end in .py (and aren't directories with an
-# __init__.py). This wrapper makes "import gyp_libyuv" work with
-# those old versions and makes it possible to execute gyp_libyuv.py
-# directly on Windows where the extension is useful.
-
-import os
-
-path = os.path.abspath(os.path.split(__file__)[0])
-execfile(os.path.join(path, 'gyp_libyuv'))
diff --git a/files/include/libyuv/convert.h b/files/include/libyuv/convert.h
deleted file mode 100644
index f571142f..00000000
--- a/files/include/libyuv/convert.h
+++ /dev/null
@@ -1,504 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_CONVERT_H_
-#define INCLUDE_LIBYUV_CONVERT_H_
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/rotate.h" // For enum RotationMode.
-
-// TODO(fbarchard): fix WebRTC source to include following libyuv headers:
-#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620
-#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620
-#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Convert I444 to I420.
-LIBYUV_API
-int I444ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert I444 to NV21.
-LIBYUV_API
-int I444ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height);
-
-// Convert I422 to I420.
-LIBYUV_API
-int I422ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert I422 to NV21.
-LIBYUV_API
-int I422ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height);
-
-// Copy I420 to I420.
-#define I420ToI420 I420Copy
-LIBYUV_API
-int I420Copy(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Copy I010 to I010
-#define I010ToI010 I010Copy
-#define H010ToH010 I010Copy
-LIBYUV_API
-int I010Copy(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint16_t* dst_y,
- int dst_stride_y,
- uint16_t* dst_u,
- int dst_stride_u,
- uint16_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert 10 bit YUV to 8 bit
-#define H010ToH420 I010ToI420
-LIBYUV_API
-int I010ToI420(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert I400 (grey) to I420.
-LIBYUV_API
-int I400ToI420(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert I400 (grey) to NV21.
-LIBYUV_API
-int I400ToNV21(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height);
-
-#define J400ToJ420 I400ToI420
-
-// Convert NV12 to I420.
-LIBYUV_API
-int NV12ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert NV21 to I420.
-LIBYUV_API
-int NV21ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert YUY2 to I420.
-LIBYUV_API
-int YUY2ToI420(const uint8_t* src_yuy2,
- int src_stride_yuy2,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert UYVY to I420.
-LIBYUV_API
-int UYVYToI420(const uint8_t* src_uyvy,
- int src_stride_uyvy,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert AYUV to NV12.
-LIBYUV_API
-int AYUVToNV12(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_uv,
- int dst_stride_uv,
- int width,
- int height);
-
-// Convert AYUV to NV21.
-LIBYUV_API
-int AYUVToNV21(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height);
-
-// Convert M420 to I420.
-LIBYUV_API
-int M420ToI420(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert Android420 to I420.
-LIBYUV_API
-int Android420ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// ARGB little endian (bgra in memory) to I420.
-LIBYUV_API
-int ARGBToI420(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// BGRA little endian (argb in memory) to I420.
-LIBYUV_API
-int BGRAToI420(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// ABGR little endian (rgba in memory) to I420.
-LIBYUV_API
-int ABGRToI420(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGBA little endian (abgr in memory) to I420.
-LIBYUV_API
-int RGBAToI420(const uint8_t* src_rgba,
- int src_stride_rgba,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB little endian (bgr in memory) to I420.
-LIBYUV_API
-int RGB24ToI420(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB little endian (bgr in memory) to J420.
-LIBYUV_API
-int RGB24ToJ420(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB big endian (rgb in memory) to I420.
-LIBYUV_API
-int RAWToI420(const uint8_t* src_raw,
- int src_stride_raw,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB16 (RGBP fourcc) little endian to I420.
-LIBYUV_API
-int RGB565ToI420(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB15 (RGBO fourcc) little endian to I420.
-LIBYUV_API
-int ARGB1555ToI420(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB12 (R444 fourcc) little endian to I420.
-LIBYUV_API
-int ARGB4444ToI420(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// RGB little endian (bgr in memory) to J400.
-LIBYUV_API
-int RGB24ToJ400(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_yj,
- int dst_stride_yj,
- int width,
- int height);
-
-#ifdef HAVE_JPEG
-// src_width/height are provided by the capture device.
-// dst_width/height, used for clipping, determine the final size.
-LIBYUV_API
-int MJPGToI420(const uint8_t* sample,
- size_t sample_size,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int src_width,
- int src_height,
- int dst_width,
- int dst_height);
-
-// JPEG to NV21
-LIBYUV_API
-int MJPGToNV21(const uint8_t* sample,
- size_t sample_size,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int src_width,
- int src_height,
- int dst_width,
- int dst_height);
-
-// Query size of MJPG in pixels.
-LIBYUV_API
-int MJPGSize(const uint8_t* sample,
- size_t sample_size,
- int* width,
- int* height);
-#endif
-
-// Convert camera sample to I420 with cropping, rotation and vertical flip.
-// "src_size" is needed to parse MJPG.
-// "dst_stride_y" number of bytes in a row of the dst_y plane.
-// Normally this would be the same as dst_width, with recommended alignment
-// to 16 bytes for better efficiency.
-// If rotation of 90 or 270 is used, stride is affected. The caller should
-// allocate the I420 buffer according to rotation.
-// "dst_stride_u" number of bytes in a row of the dst_u plane.
-// Normally this would be the same as (dst_width + 1) / 2, with
-// recommended alignment to 16 bytes for better efficiency.
-// If rotation of 90 or 270 is used, stride is affected.
-// "crop_x" and "crop_y" are starting position for cropping.
-// To center, crop_x = (src_width - dst_width) / 2
-// crop_y = (src_height - dst_height) / 2
-// "src_width" / "src_height" is size of src_frame in pixels.
-// "src_height" can be negative indicating a vertically flipped image source.
-// "crop_width" / "crop_height" is the size to crop the src to.
-// Must be less than or equal to src_width/src_height
-// Cropping parameters are pre-rotation.
-// "rotation" can be 0, 90, 180 or 270.
-// "fourcc" is a fourcc. ie 'I420', 'YUY2'
-// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
-LIBYUV_API
-int ConvertToI420(const uint8_t* sample,
- size_t sample_size,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int crop_x,
- int crop_y,
- int src_width,
- int src_height,
- int crop_width,
- int crop_height,
- enum RotationMode rotation,
- uint32_t fourcc);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_CONVERT_H_
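
The ConvertToI420 comment block above packs the stride, crop, and rotation rules into prose. A minimal caller-side sketch follows (not part of the diff): the 640x480 YUY2 frame, the helper name, and the use of FOURCC_YUY2 from libyuv/video_common.h are illustrative assumptions.

#include <cstddef>
#include <cstdint>
#include <vector>
#include "libyuv/convert.h"
#include "libyuv/video_common.h"  // Assumed home of the FOURCC constants.

// Convert a 640x480 YUY2 camera frame to I420 with a 90 degree rotation.
bool SampleToI420(const uint8_t* sample, size_t sample_size) {
  const int src_width = 640, src_height = 480;
  const int crop_width = src_width, crop_height = src_height;  // Pre-rotation.
  // With kRotate90 the output dimensions swap, so the caller allocates the
  // I420 planes (and strides) from the rotated size.
  const int dst_width = crop_height, dst_height = crop_width;
  const int y_stride = dst_width;  // Recommended: align to 16 for speed.
  const int uv_stride = (dst_width + 1) / 2;
  std::vector<uint8_t> y(y_stride * dst_height);
  std::vector<uint8_t> u(uv_stride * ((dst_height + 1) / 2));
  std::vector<uint8_t> v(uv_stride * ((dst_height + 1) / 2));
  return libyuv::ConvertToI420(sample, sample_size,
                               y.data(), y_stride,
                               u.data(), uv_stride,
                               v.data(), uv_stride,
                               /*crop_x=*/0, /*crop_y=*/0,
                               src_width, src_height,
                               crop_width, crop_height,
                               libyuv::kRotate90,
                               libyuv::FOURCC_YUY2) == 0;
}
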
diff --git a/files/include/libyuv/convert_argb.h b/files/include/libyuv/convert_argb.h
deleted file mode 100644
index e8ed1f59..00000000
--- a/files/include/libyuv/convert_argb.h
+++ /dev/null
@@ -1,721 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
-#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/rotate.h" // For enum RotationMode.
-
-// TODO(fbarchard): This set of functions should exactly match convert.h
-// TODO(fbarchard): Add tests. Create random content of right size and convert
-// with C vs Opt and or to I420 and compare.
-// TODO(fbarchard): Some of these functions lack parameter setting.
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Alias.
-#define ARGBToARGB ARGBCopy
-
-// Copy ARGB to ARGB.
-LIBYUV_API
-int ARGBCopy(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert I420 to ARGB.
-LIBYUV_API
-int I420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Duplicate prototype for function in convert_from.h for remoting.
-LIBYUV_API
-int I420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert I010 to ARGB.
-LIBYUV_API
-int I010ToARGB(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert I010 to ABGR.
-LIBYUV_API
-int I010ToABGR(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert H010 to ARGB.
-LIBYUV_API
-int H010ToARGB(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert H010 to ABGR.
-LIBYUV_API
-int H010ToABGR(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert I422 to ARGB.
-LIBYUV_API
-int I422ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert I444 to ARGB.
-LIBYUV_API
-int I444ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert J444 to ARGB.
-LIBYUV_API
-int J444ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert I444 to ABGR.
-LIBYUV_API
-int I444ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert I420 with Alpha to preattenuated ARGB.
-LIBYUV_API
-int I420AlphaToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height,
- int attenuate);
-
-// Convert I420 with Alpha to preattenuated ABGR.
-LIBYUV_API
-int I420AlphaToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height,
- int attenuate);
-
-// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
-LIBYUV_API
-int I400ToARGB(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert J400 (jpeg grey) to ARGB.
-LIBYUV_API
-int J400ToARGB(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Alias.
-#define YToARGB I400ToARGB
-
-// Convert NV12 to ARGB.
-LIBYUV_API
-int NV12ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert NV21 to ARGB.
-LIBYUV_API
-int NV21ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert NV12 to ABGR.
-LIBYUV_API
-int NV12ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert NV21 to ABGR.
-LIBYUV_API
-int NV21ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert NV12 to RGB24.
-LIBYUV_API
-int NV12ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-// Convert NV21 to RGB24.
-LIBYUV_API
-int NV21ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-// Convert NV21 to YUV24.
-LIBYUV_API
-int NV21ToYUV24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_yuv24,
- int dst_stride_yuv24,
- int width,
- int height);
-
-// Convert NV12 to RAW.
-LIBYUV_API
-int NV12ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-// Convert NV21 to RAW.
-LIBYUV_API
-int NV21ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-// Convert M420 to ARGB.
-LIBYUV_API
-int M420ToARGB(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert YUY2 to ARGB.
-LIBYUV_API
-int YUY2ToARGB(const uint8_t* src_yuy2,
- int src_stride_yuy2,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert UYVY to ARGB.
-LIBYUV_API
-int UYVYToARGB(const uint8_t* src_uyvy,
- int src_stride_uyvy,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert J420 to ARGB.
-LIBYUV_API
-int J420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert J422 to ARGB.
-LIBYUV_API
-int J422ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert J420 to ABGR.
-LIBYUV_API
-int J420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert J422 to ABGR.
-LIBYUV_API
-int J422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert H420 to ARGB.
-LIBYUV_API
-int H420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert H422 to ARGB.
-LIBYUV_API
-int H422ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert H420 to ABGR.
-LIBYUV_API
-int H420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert H422 to ABGR.
-LIBYUV_API
-int H422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert I010 to AR30.
-LIBYUV_API
-int I010ToAR30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
-// Convert H010 to AR30.
-LIBYUV_API
-int H010ToAR30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
-// Convert I010 to AB30.
-LIBYUV_API
-int I010ToAB30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
- int width,
- int height);
-
-// Convert H010 to AB30.
-LIBYUV_API
-int H010ToAB30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
- int width,
- int height);
-
-// BGRA little endian (argb in memory) to ARGB.
-LIBYUV_API
-int BGRAToARGB(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// ABGR little endian (rgba in memory) to ARGB.
-LIBYUV_API
-int ABGRToARGB(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// RGBA little endian (abgr in memory) to ARGB.
-LIBYUV_API
-int RGBAToARGB(const uint8_t* src_rgba,
- int src_stride_rgba,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Deprecated function name.
-#define BG24ToARGB RGB24ToARGB
-
-// RGB little endian (bgr in memory) to ARGB.
-LIBYUV_API
-int RGB24ToARGB(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// RGB big endian (rgb in memory) to ARGB.
-LIBYUV_API
-int RAWToARGB(const uint8_t* src_raw,
- int src_stride_raw,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// RGB16 (RGBP fourcc) little endian to ARGB.
-LIBYUV_API
-int RGB565ToARGB(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// RGB15 (RGBO fourcc) little endian to ARGB.
-LIBYUV_API
-int ARGB1555ToARGB(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// RGB12 (R444 fourcc) little endian to ARGB.
-LIBYUV_API
-int ARGB4444ToARGB(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Aliases
-#define AB30ToARGB AR30ToABGR
-#define AB30ToABGR AR30ToARGB
-#define AB30ToAR30 AR30ToAB30
-
-// Convert AR30 To ARGB.
-LIBYUV_API
-int AR30ToARGB(const uint8_t* src_ar30,
- int src_stride_ar30,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert AR30 To ABGR.
-LIBYUV_API
-int AR30ToABGR(const uint8_t* src_ar30,
- int src_stride_ar30,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert AR30 To AB30.
-LIBYUV_API
-int AR30ToAB30(const uint8_t* src_ar30,
- int src_stride_ar30,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
- int width,
- int height);
-
-#ifdef HAVE_JPEG
-// src_width/height are provided by the capture device.
-// dst_width/height, used for clipping, determine the final size.
-LIBYUV_API
-int MJPGToARGB(const uint8_t* sample,
- size_t sample_size,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int src_width,
- int src_height,
- int dst_width,
- int dst_height);
-#endif
-
-// Convert Android420 to ARGB.
-LIBYUV_API
-int Android420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert Android420 to ABGR.
-LIBYUV_API
-int Android420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert camera sample to ARGB with cropping, rotation and vertical flip.
-// "sample_size" is needed to parse MJPG.
-// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
-// Normally this would be the same as dst_width, with recommended alignment
-// to 16 bytes for better efficiency.
-// If rotation of 90 or 270 is used, stride is affected. The caller should
-// allocate the I420 buffer according to rotation.
-// "dst_stride_u" number of bytes in a row of the dst_u plane.
-// Normally this would be the same as (dst_width + 1) / 2, with
-// recommended alignment to 16 bytes for better efficiency.
-// If rotation of 90 or 270 is used, stride is affected.
-// "crop_x" and "crop_y" are starting position for cropping.
-// To center, crop_x = (src_width - dst_width) / 2
-// crop_y = (src_height - dst_height) / 2
-// "src_width" / "src_height" is size of src_frame in pixels.
-// "src_height" can be negative indicating a vertically flipped image source.
-// "crop_width" / "crop_height" is the size to crop the src to.
-// Must be less than or equal to src_width/src_height
-// Cropping parameters are pre-rotation.
-// "rotation" can be 0, 90, 180 or 270.
-// "fourcc" is a fourcc. ie 'I420', 'YUY2'
-// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
-LIBYUV_API
-int ConvertToARGB(const uint8_t* sample,
- size_t sample_size,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int crop_x,
- int crop_y,
- int src_width,
- int src_height,
- int crop_width,
- int crop_height,
- enum RotationMode rotation,
- uint32_t fourcc);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_
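
The ARGB converters above count strides in bytes, so a tightly packed destination row is width * 4. A short sketch of that convention (not from the diff; the helper name and packed stride are assumptions):

#include <cstdint>
#include <vector>
#include "libyuv/convert_argb.h"

// Convert an I420 frame to ARGB. ARGB strides are in bytes, so a tightly
// packed destination row is width * 4.
void I420FrameToARGB(const uint8_t* y, int y_stride,
                     const uint8_t* u, int u_stride,
                     const uint8_t* v, int v_stride,
                     int width, int height,
                     std::vector<uint8_t>* argb) {
  const int argb_stride = width * 4;
  argb->resize(static_cast<size_t>(argb_stride) * height);
  libyuv::I420ToARGB(y, y_stride, u, u_stride, v, v_stride,
                     argb->data(), argb_stride, width, height);
}
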
diff --git a/files/include/libyuv/rotate.h b/files/include/libyuv/rotate.h
deleted file mode 100644
index c64e0216..00000000
--- a/files/include/libyuv/rotate.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_ROTATE_H_
-#define INCLUDE_LIBYUV_ROTATE_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Supported rotation.
-typedef enum RotationMode {
- kRotate0 = 0, // No rotation.
- kRotate90 = 90, // Rotate 90 degrees clockwise.
- kRotate180 = 180, // Rotate 180 degrees.
- kRotate270 = 270, // Rotate 270 degrees clockwise.
-
- // Deprecated.
- kRotateNone = 0,
- kRotateClockwise = 90,
- kRotateCounterClockwise = 270,
-} RotationModeEnum;
-
-// Rotate I420 frame.
-LIBYUV_API
-int I420Rotate(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height,
- enum RotationMode mode);
-
-// Rotate I444 frame.
-LIBYUV_API
-int I444Rotate(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height,
- enum RotationMode mode);
-
-// Rotate NV12 input and store in I420.
-LIBYUV_API
-int NV12ToI420Rotate(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height,
- enum RotationMode mode);
-
-// Rotate a plane by 0, 90, 180, or 270.
-LIBYUV_API
-int RotatePlane(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height,
- enum RotationMode mode);
-
-// Rotate planes by 90, 180, 270. Deprecated.
-LIBYUV_API
-void RotatePlane90(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height);
-
-LIBYUV_API
-void RotatePlane180(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height);
-
-LIBYUV_API
-void RotatePlane270(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height);
-
-LIBYUV_API
-void RotateUV90(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height);
-
-// Rotations for when U and V are interleaved.
-// These functions take one input pointer and
-// split the data into two buffers while
-// rotating them. Deprecated.
-LIBYUV_API
-void RotateUV180(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height);
-
-LIBYUV_API
-void RotateUV270(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height);
-
-// The 90 and 270 functions are based on transposes.
-// Doing a transpose with a reversed read or write
-// order results in a rotation by +/- 90 degrees.
-// Deprecated.
-LIBYUV_API
-void TransposePlane(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height);
-
-LIBYUV_API
-void TransposeUV(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_ROTATE_H_
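
For kRotate90 and kRotate270 the destination planes swap width and height, as the I420Rotate comments imply. A sketch of that bookkeeping (not from the diff; the helper name and tightly packed strides are assumptions):

#include <cstdint>
#include <vector>
#include "libyuv/rotate.h"

// Rotate an I420 frame by 90 degrees. Destination strides are computed
// from the rotated (swapped) dimensions.
void RotateI420By90(const uint8_t* src_y, int src_stride_y,
                    const uint8_t* src_u, int src_stride_u,
                    const uint8_t* src_v, int src_stride_v,
                    int width, int height) {
  const int dst_width = height, dst_height = width;
  const int dst_stride_y = dst_width;
  const int dst_stride_uv = (dst_width + 1) / 2;
  std::vector<uint8_t> dst_y(dst_stride_y * dst_height);
  std::vector<uint8_t> dst_u(dst_stride_uv * ((dst_height + 1) / 2));
  std::vector<uint8_t> dst_v(dst_stride_uv * ((dst_height + 1) / 2));
  libyuv::I420Rotate(src_y, src_stride_y, src_u, src_stride_u,
                     src_v, src_stride_v,
                     dst_y.data(), dst_stride_y,
                     dst_u.data(), dst_stride_uv,
                     dst_v.data(), dst_stride_uv,
                     width, height, libyuv::kRotate90);
}
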
diff --git a/files/include/libyuv/scale.h b/files/include/libyuv/scale.h
deleted file mode 100644
index 23ba1634..00000000
--- a/files/include/libyuv/scale.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_SCALE_H_
-#define INCLUDE_LIBYUV_SCALE_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Supported filtering.
-typedef enum FilterMode {
- kFilterNone = 0, // Point sample; Fastest.
- kFilterLinear = 1, // Filter horizontally only.
- kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
- kFilterBox = 3 // Highest quality.
-} FilterModeEnum;
-
-// Scale a YUV plane.
-LIBYUV_API
-void ScalePlane(const uint8_t* src,
- int src_stride,
- int src_width,
- int src_height,
- uint8_t* dst,
- int dst_stride,
- int dst_width,
- int dst_height,
- enum FilterMode filtering);
-
-LIBYUV_API
-void ScalePlane_16(const uint16_t* src,
- int src_stride,
- int src_width,
- int src_height,
- uint16_t* dst,
- int dst_stride,
- int dst_width,
- int dst_height,
- enum FilterMode filtering);
-
-// Scales a YUV 4:2:0 image from the src width and height to the
-// dst width and height.
-// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
-// used. This produces basic (blocky) quality at the fastest speed.
-// If filtering is kFilterBilinear, interpolation is used to produce a better
-// quality image, at the expense of speed.
-// If filtering is kFilterBox, averaging is used to produce an even better
-// quality image, at a further expense of speed.
-// Returns 0 if successful.
-
-LIBYUV_API
-int I420Scale(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_width,
- int src_height,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int dst_width,
- int dst_height,
- enum FilterMode filtering);
-
-LIBYUV_API
-int I420Scale_16(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- int src_width,
- int src_height,
- uint16_t* dst_y,
- int dst_stride_y,
- uint16_t* dst_u,
- int dst_stride_u,
- uint16_t* dst_v,
- int dst_stride_v,
- int dst_width,
- int dst_height,
- enum FilterMode filtering);
-
-// Scales a YUV 4:4:4 image from the src width and height to the
-// dst width and height.
-// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
-// used. This produces basic (blocky) quality at the fastest speed.
-// If filtering is kFilterBilinear, interpolation is used to produce a better
-// quality image, at the expense of speed.
-// If filtering is kFilterBox, averaging is used to produce an even better
-// quality image, at a further expense of speed.
-// Returns 0 if successful.
-
-LIBYUV_API
-int I444Scale(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_width,
- int src_height,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int dst_width,
- int dst_height,
- enum FilterMode filtering);
-
-LIBYUV_API
-int I444Scale_16(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- int src_width,
- int src_height,
- uint16_t* dst_y,
- int dst_stride_y,
- uint16_t* dst_u,
- int dst_stride_u,
- uint16_t* dst_v,
- int dst_stride_v,
- int dst_width,
- int dst_height,
- enum FilterMode filtering);
-
-#ifdef __cplusplus
-// Legacy API. Deprecated.
-LIBYUV_API
-int Scale(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- int src_stride_y,
- int src_stride_u,
- int src_stride_v,
- int src_width,
- int src_height,
- uint8_t* dst_y,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int dst_stride_y,
- int dst_stride_u,
- int dst_stride_v,
- int dst_width,
- int dst_height,
- LIBYUV_BOOL interpolate);
-
-// For testing, allow disabling of specialized scalers.
-LIBYUV_API
-void SetUseReferenceImpl(LIBYUV_BOOL use);
-#endif // __cplusplus
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_SCALE_H_
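
The FilterMode comments above trade speed against quality. A sketch wiring them into I420Scale (not from the diff; the helper name, half-size target, and packed strides are assumptions):

#include <cstdint>
#include <vector>
#include "libyuv/scale.h"

// Downscale an I420 frame to half size. kFilterBox is the highest-quality
// (and slowest) of the four FilterMode options; kFilterNone is the fastest.
int HalveI420(const uint8_t* src_y, int src_stride_y,
              const uint8_t* src_u, int src_stride_u,
              const uint8_t* src_v, int src_stride_v,
              int src_width, int src_height) {
  const int dst_width = src_width / 2, dst_height = src_height / 2;
  const int dst_stride_y = dst_width;
  const int dst_stride_uv = (dst_width + 1) / 2;
  std::vector<uint8_t> dst_y(dst_stride_y * dst_height);
  std::vector<uint8_t> dst_u(dst_stride_uv * ((dst_height + 1) / 2));
  std::vector<uint8_t> dst_v(dst_stride_uv * ((dst_height + 1) / 2));
  return libyuv::I420Scale(src_y, src_stride_y, src_u, src_stride_u,
                           src_v, src_stride_v, src_width, src_height,
                           dst_y.data(), dst_stride_y,
                           dst_u.data(), dst_stride_uv,
                           dst_v.data(), dst_stride_uv,
                           dst_width, dst_height, libyuv::kFilterBox);
}
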
diff --git a/files/infra/config/PRESUBMIT.py b/files/infra/config/PRESUBMIT.py
deleted file mode 100644
index 89eaa519..00000000
--- a/files/infra/config/PRESUBMIT.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright 2018 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-
-def CheckChangeOnUpload(input_api, output_api):
- return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api)
-
-
-def CheckChangeOnCommit(input_api, output_api):
- return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api)
diff --git a/files/infra/config/README.md b/files/infra/config/README.md
deleted file mode 100644
index c036d610..00000000
--- a/files/infra/config/README.md
+++ /dev/null
@@ -1 +0,0 @@
-This directory contains configuration files for infra services.
diff --git a/files/infra/config/cq.cfg b/files/infra/config/cq.cfg
deleted file mode 100644
index 7bcc0595..00000000
--- a/files/infra/config/cq.cfg
+++ /dev/null
@@ -1,51 +0,0 @@
-# Commit Queue configuration file. The documentation of the format can be found
-# at http://luci-config.appspot.com/schemas/projects/refs:cq.cfg.
-
-version: 1
-cq_status_url: "https://chromium-cq-status.appspot.com"
-git_repo_url: "https://chromium.googlesource.com/libyuv/libyuv.git"
-
-gerrit {}
-
-verifiers {
- gerrit_cq_ability {
- committer_list: "project-libyuv-committers"
- dry_run_access_list: "project-libyuv-tryjob-access"
- }
-
- try_job {
- buckets {
- name: "luci.libyuv.try"
- builders { name: "win" }
- builders { name: "win_rel" }
- builders { name: "win_x64_rel" }
- builders { name: "win_clang" }
- builders { name: "win_clang_rel" }
- builders { name: "win_x64_clang_rel" }
- builders { name: "mac" }
- builders { name: "mac_rel" }
- builders { name: "mac_asan" }
- builders { name: "ios" }
- builders { name: "ios_rel" }
- builders { name: "ios_arm64" }
- builders { name: "ios_arm64_rel" }
- builders { name: "linux" }
- builders { name: "linux_rel" }
- builders {
- name: "linux_gcc"
- experiment_percentage: 100
- }
- builders { name: "linux_memcheck" }
- builders { name: "linux_tsan2" }
- builders { name: "linux_asan" }
- builders { name: "linux_msan" }
- builders { name: "linux_ubsan" }
- builders { name: "linux_ubsan_vptr" }
- builders { name: "android" }
- builders { name: "android_rel" }
- builders { name: "android_arm64" }
- builders { name: "android_x86" }
- builders { name: "android_x64" }
- }
- }
-}
diff --git a/files/libyuv_nacl.gyp b/files/libyuv_nacl.gyp
deleted file mode 100644
index b8fe57ee..00000000
--- a/files/libyuv_nacl.gyp
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-{
- 'includes': [
- 'libyuv.gypi',
- '../../native_client/build/untrusted.gypi',
- ],
- 'targets': [
- {
- 'target_name': 'libyuv_nacl',
- 'type': 'none',
- 'variables': {
- 'nlib_target': 'libyuv_nacl.a',
- 'build_glibc': 0,
- 'build_newlib': 0,
- 'build_pnacl_newlib': 1,
- },
- 'include_dirs': [
- 'include',
- ],
- 'direct_dependent_settings': {
- 'include_dirs': [
- 'include',
- ],
- },
- 'sources': [
- '<@(libyuv_sources)',
- ],
- }, # target libyuv_nacl
- ]
-}
diff --git a/files/libyuv_test.gyp b/files/libyuv_test.gyp
deleted file mode 100644
index 87e7a5bb..00000000
--- a/files/libyuv_test.gyp
+++ /dev/null
@@ -1,203 +0,0 @@
-# Copyright 2011 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-{
- 'variables': {
- # Can be enabled if your libjpeg has GYP support.
- 'libyuv_disable_jpeg%': 1,
- 'mips_msa%': 0, # Default to msa off.
- },
- 'targets': [
- {
- 'target_name': 'libyuv_unittest',
- 'type': '<(gtest_target_type)',
- 'dependencies': [
- 'libyuv.gyp:libyuv',
- 'testing/gtest.gyp:gtest',
- 'third_party/gflags/gflags.gyp:gflags',
- ],
- 'direct_dependent_settings': {
- 'defines': [
- 'GTEST_RELATIVE_PATH',
- ],
- },
- 'export_dependent_settings': [
- '<(DEPTH)/testing/gtest.gyp:gtest',
- ],
- 'sources': [
- # headers
- 'unit_test/unit_test.h',
-
- # sources
- 'unit_test/basictypes_test.cc',
- 'unit_test/compare_test.cc',
- 'unit_test/color_test.cc',
- 'unit_test/convert_test.cc',
- 'unit_test/cpu_test.cc',
- 'unit_test/math_test.cc',
- 'unit_test/planar_test.cc',
- 'unit_test/rotate_argb_test.cc',
- 'unit_test/rotate_test.cc',
- 'unit_test/scale_argb_test.cc',
- 'unit_test/scale_test.cc',
- 'unit_test/unit_test.cc',
- 'unit_test/video_common_test.cc',
- ],
- 'conditions': [
- ['OS=="linux"', {
- 'cflags': [
- '-fexceptions',
- ],
- }],
- [ 'OS == "ios"', {
- 'xcode_settings': {
- 'DEBUGGING_SYMBOLS': 'YES',
- 'DEBUG_INFORMATION_FORMAT' : 'dwarf-with-dsym',
- # Work around compile issue with isosim.mm, see
- # https://code.google.com/p/libyuv/issues/detail?id=548 for details.
- 'WARNING_CFLAGS': [
- '-Wno-sometimes-uninitialized',
- ],
- },
- 'cflags': [
- '-Wno-sometimes-uninitialized',
- ],
- }],
- [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
- 'defines': [
- 'HAVE_JPEG',
- ],
- }],
- ['OS=="android"', {
- 'dependencies': [
- '<(DEPTH)/testing/android/native_test.gyp:native_test_native_code',
- ],
- }],
- # TODO(YangZhang): These lines can be removed when high accuracy
- # YUV to RGB conversion is ported to Neon.
- [ '(target_arch == "armv7" or target_arch == "armv7s" \
- or (target_arch == "arm" and arm_version >= 7) \
- or target_arch == "arm64") \
- and (arm_neon == 1 or arm_neon_optional == 1)', {
- 'defines': [
- 'LIBYUV_NEON'
- ],
- }],
- [ '(target_arch == "mipsel" or target_arch == "mips64el") \
- and (mips_msa == 1)', {
- 'defines': [
- 'LIBYUV_MSA'
- ],
- }],
- ], # conditions
- 'defines': [
- # Enable the following 3 macros to turn off assembly for the specified CPU.
- # 'LIBYUV_DISABLE_X86',
- # 'LIBYUV_DISABLE_NEON',
- # 'LIBYUV_DISABLE_DSPR2',
- # Enable the following macro to build libyuv as a shared library (dll).
- # 'LIBYUV_USING_SHARED_LIBRARY',
- ],
- },
- {
- 'target_name': 'compare',
- 'type': 'executable',
- 'dependencies': [
- 'libyuv.gyp:libyuv',
- ],
- 'sources': [
- # sources
- 'util/compare.cc',
- ],
- 'conditions': [
- ['OS=="linux"', {
- 'cflags': [
- '-fexceptions',
- ],
- }],
- ], # conditions
- },
- {
- 'target_name': 'yuvconvert',
- 'type': 'executable',
- 'dependencies': [
- 'libyuv.gyp:libyuv',
- ],
- 'sources': [
- # sources
- 'util/yuvconvert.cc',
- ],
- 'conditions': [
- ['OS=="linux"', {
- 'cflags': [
- '-fexceptions',
- ],
- }],
- ], # conditions
- },
- # TODO(fbarchard): Enable SSE2 and OpenMP for better performance.
- {
- 'target_name': 'psnr',
- 'type': 'executable',
- 'sources': [
- # sources
- 'util/psnr_main.cc',
- 'util/psnr.cc',
- 'util/ssim.cc',
- ],
- 'dependencies': [
- 'libyuv.gyp:libyuv',
- ],
- 'conditions': [
- [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
- 'defines': [
- 'HAVE_JPEG',
- ],
- }],
- ], # conditions
- },
-
- {
- 'target_name': 'cpuid',
- 'type': 'executable',
- 'sources': [
- # sources
- 'util/cpuid.c',
- ],
- 'dependencies': [
- 'libyuv.gyp:libyuv',
- ],
- },
- ], # targets
- 'conditions': [
- ['OS=="android"', {
- 'targets': [
- {
- 'target_name': 'yuv_unittest_apk',
- 'type': 'none',
- 'variables': {
- 'test_suite_name': 'yuv_unittest',
- 'input_shlib_path': '<(SHARED_LIB_DIR)/<(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)',
- },
- 'includes': [
- 'build/apk_test.gypi',
- ],
- 'dependencies': [
- 'libyuv_unittest',
- ],
- },
- ],
- }],
- ],
-}
-
-# Local Variables:
-# tab-width:2
-# indent-tabs-mode:nil
-# End:
-# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/files/public.mk b/files/public.mk
deleted file mode 100644
index 1342307a..00000000
--- a/files/public.mk
+++ /dev/null
@@ -1,13 +0,0 @@
-# This file contains all the common make variables which are useful for
-# anyone depending on this library.
-# Note that dependencies on the NDK are not listed directly, since the NDK
-# adds them automatically.
-
-LIBYUV_INCLUDES := $(LIBYUV_PATH)/include
-
-LIBYUV_C_FLAGS :=
-
-LIBYUV_CPP_FLAGS :=
-
-LIBYUV_LDLIBS :=
-LIBYUV_DEP_MODULES :=
diff --git a/files/setup_links.py b/files/setup_links.py
deleted file mode 100755
index b2b459e6..00000000
--- a/files/setup_links.py
+++ /dev/null
@@ -1,497 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-"""Setup links to a Chromium checkout for WebRTC.
-
-WebRTC standalone shares a lot of dependencies and build tools with Chromium.
-To do this, many of the paths of a Chromium checkout are emulated by creating
-symlinks to files and directories. This script handles the setup of symlinks to
-achieve this.
-
-It also handles cleanup of the legacy Subversion-based approach that was used
-before Chrome switched over their master repo from Subversion to Git.
-"""
-
-
-import ctypes
-import errno
-import logging
-import optparse
-import os
-import shelve
-import shutil
-import subprocess
-import sys
-import textwrap
-
-
-DIRECTORIES = [
- 'build',
- 'buildtools',
- 'mojo', # TODO(kjellander): Remove, see webrtc:5629.
- 'native_client',
- 'net',
- 'testing',
- 'third_party/binutils',
- 'third_party/drmemory',
- 'third_party/instrumented_libraries',
- 'third_party/libjpeg',
- 'third_party/libjpeg_turbo',
- 'third_party/llvm-build',
- 'third_party/lss',
- 'third_party/yasm',
- 'third_party/WebKit', # TODO(kjellander): Remove, see webrtc:5629.
- 'tools/clang',
- 'tools/gn',
- 'tools/gyp',
- 'tools/memory',
- 'tools/python',
- 'tools/swarming_client',
- 'tools/valgrind',
- 'tools/vim',
- 'tools/win',
-]
-
-from sync_chromium import get_target_os_list
-target_os = get_target_os_list()
-if 'android' in target_os:
- DIRECTORIES += [
- 'base',
- 'third_party/android_platform',
- 'third_party/android_tools',
- 'third_party/appurify-python',
- 'third_party/ashmem',
- 'third_party/catapult',
- 'third_party/icu',
- 'third_party/ijar',
- 'third_party/jsr-305',
- 'third_party/junit',
- 'third_party/libxml',
- 'third_party/mockito',
- 'third_party/modp_b64',
- 'third_party/protobuf',
- 'third_party/requests',
- 'third_party/robolectric',
- 'tools/android',
- 'tools/grit',
- ]
-if 'ios' in target_os:
- DIRECTORIES.append('third_party/class-dump')
-
-FILES = {
- 'tools/isolate_driver.py': None,
- 'third_party/BUILD.gn': None,
-}
-
-ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
-CHROMIUM_CHECKOUT = os.path.join('chromium', 'src')
-LINKS_DB = 'links'
-
-# Version management to make future upgrades/downgrades easier to support.
-SCHEMA_VERSION = 1
-
-
-def query_yes_no(question, default=False):
- """Ask a yes/no question via raw_input() and return their answer.
-
- Modified from http://stackoverflow.com/a/3041990.
- """
- prompt = " [%s/%%s]: "
- prompt = prompt % ('Y' if default is True else 'y')
- prompt = prompt % ('N' if default is False else 'n')
-
- if default is None:
- default = 'INVALID'
-
- while True:
- sys.stdout.write(question + prompt)
- choice = raw_input().lower()
- if choice == '' and default != 'INVALID':
- return default
-
- if 'yes'.startswith(choice):
- return True
- elif 'no'.startswith(choice):
- return False
-
- print "Please respond with 'yes' or 'no' (or 'y' or 'n')."
-
-
-# Actions
-class Action(object):
- def __init__(self, dangerous):
- self.dangerous = dangerous
-
- def announce(self, planning):
- """Log a description of this action.
-
- Args:
- planning - True iff we're in the planning stage, False if we're in the
- doit stage.
- """
- pass
-
- def doit(self, links_db):
- """Execute the action, recording what we did to links_db, if necessary."""
- pass
-
-
-class Remove(Action):
- def __init__(self, path, dangerous):
- super(Remove, self).__init__(dangerous)
- self._priority = 0
- self._path = path
-
- def announce(self, planning):
- log = logging.warn
- filesystem_type = 'file'
- if not self.dangerous:
- log = logging.info
- filesystem_type = 'link'
- if planning:
- log('Planning to remove %s: %s', filesystem_type, self._path)
- else:
- log('Removing %s: %s', filesystem_type, self._path)
-
- def doit(self, _):
- os.remove(self._path)
-
-
-class Rmtree(Action):
- def __init__(self, path):
- super(Rmtree, self).__init__(dangerous=True)
- self._priority = 0
- self._path = path
-
- def announce(self, planning):
- if planning:
- logging.warn('Planning to remove directory: %s', self._path)
- else:
- logging.warn('Removing directory: %s', self._path)
-
- def doit(self, _):
- if sys.platform.startswith('win'):
- # shutil.rmtree() doesn't work on Windows if any of the directories are
- # read-only, which svn repositories are.
- subprocess.check_call(['rd', '/q', '/s', self._path], shell=True)
- else:
- shutil.rmtree(self._path)
-
-
-class Makedirs(Action):
- def __init__(self, path):
- super(Makedirs, self).__init__(dangerous=False)
- self._priority = 1
- self._path = path
-
- def doit(self, _):
- try:
- os.makedirs(self._path)
- except OSError as e:
- if e.errno != errno.EEXIST:
- raise
-
-
-class Symlink(Action):
- def __init__(self, source_path, link_path):
- super(Symlink, self).__init__(dangerous=False)
- self._priority = 2
- self._source_path = source_path
- self._link_path = link_path
-
- def announce(self, planning):
- if planning:
- logging.info(
- 'Planning to create link from %s to %s', self._link_path,
- self._source_path)
- else:
- logging.debug(
- 'Linking from %s to %s', self._link_path, self._source_path)
-
- def doit(self, links_db):
- # Files not in the root directory need relative path calculation.
- # On Windows, use absolute paths instead since NTFS doesn't seem to support
- # relative paths for symlinks.
- if sys.platform.startswith('win'):
- source_path = os.path.abspath(self._source_path)
- else:
- if os.path.dirname(self._link_path) != self._link_path:
- source_path = os.path.relpath(self._source_path,
- os.path.dirname(self._link_path))
-
- os.symlink(source_path, os.path.abspath(self._link_path))
- links_db[self._source_path] = self._link_path
-
-
-class LinkError(IOError):
- """Failed to create a link."""
- pass
-
-
-# Handles symlink creation on the different platforms.
-if sys.platform.startswith('win'):
- def symlink(source_path, link_path):
- flag = 1 if os.path.isdir(source_path) else 0
- if not ctypes.windll.kernel32.CreateSymbolicLinkW(
- unicode(link_path), unicode(source_path), flag):
- raise OSError('Failed to create symlink to %s. Notice that only NTFS '
- 'version 5.0 and up has all the needed APIs for '
- 'creating symlinks.' % source_path)
- os.symlink = symlink
-
-
-class WebRTCLinkSetup(object):
- def __init__(self, links_db, force=False, dry_run=False, prompt=False):
- self._force = force
- self._dry_run = dry_run
- self._prompt = prompt
- self._links_db = links_db
-
- def CreateLinks(self, on_bot):
- logging.debug('CreateLinks')
- # First, make a plan of action
- actions = []
-
- for source_path, link_path in FILES.iteritems():
- actions += self._ActionForPath(
- source_path, link_path, check_fn=os.path.isfile, check_msg='files')
- for source_dir in DIRECTORIES:
- actions += self._ActionForPath(
- source_dir, None, check_fn=os.path.isdir,
- check_msg='directories')
-
- if not on_bot and self._force:
- # When making the manual switch from legacy SVN checkouts to the new
- # Git-based Chromium DEPS, the .gclient_entries file that contains cached
- # URLs for all DEPS entries must be removed to avoid future sync problems.
- entries_file = os.path.join(os.path.dirname(ROOT_DIR), '.gclient_entries')
- if os.path.exists(entries_file):
- actions.append(Remove(entries_file, dangerous=True))
-
- actions.sort()
-
- if self._dry_run:
- for action in actions:
- action.announce(planning=True)
- logging.info('Not doing anything because dry-run was specified.')
- sys.exit(0)
-
- if any(a.dangerous for a in actions):
- logging.warn('Dangerous actions:')
- for action in (a for a in actions if a.dangerous):
- action.announce(planning=True)
- print
-
- if not self._force:
- logging.error(textwrap.dedent("""\
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- A C T I O N   R E Q U I R E D
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- Because chromium/src is transitioning to Git (from SVN), we needed to
- change the way that the WebRTC standalone checkout works. Instead of
- individually syncing subdirectories of Chromium in SVN, we're now
- syncing Chromium (and all of its DEPS, as defined by its own DEPS file),
- into the `chromium/src` directory.
-
- As such, all Chromium directories which are currently pulled by DEPS are
- now replaced with a symlink into the full Chromium checkout.
-
- To avoid disrupting developers, we've chosen to not delete your
- directories forcibly, in case you have some work in progress in one of
- them :).
-
- ACTION REQUIRED:
- Before running `gclient sync|runhooks` again, you must run:
- %s%s --force
-
- This will replace all directories which must now be symlinks, after
- prompting with a summary of the work to be done.
- """), 'python ' if sys.platform.startswith('win') else '', sys.argv[0])
- sys.exit(1)
- elif self._prompt:
- if not query_yes_no('Would you like to perform the above plan?'):
- sys.exit(1)
-
- for action in actions:
- action.announce(planning=False)
- action.doit(self._links_db)
-
- if not on_bot and self._force:
- logging.info('Completed!\n\nNow run `gclient sync|runhooks` again to '
- 'let the remaining hooks (that probably were interrupted) '
- 'execute.')
-
- def CleanupLinks(self):
- logging.debug('CleanupLinks')
- for source, link_path in self._links_db.iteritems():
- if source == 'SCHEMA_VERSION':
- continue
- if os.path.islink(link_path) or sys.platform.startswith('win'):
- # os.path.islink() always returns false on Windows
- # See http://bugs.python.org/issue13143.
- logging.debug('Removing link to %s at %s', source, link_path)
- if not self._dry_run:
- if os.path.exists(link_path):
- if sys.platform.startswith('win') and os.path.isdir(link_path):
- subprocess.check_call(['rmdir', '/q', '/s', link_path],
- shell=True)
- else:
- os.remove(link_path)
- del self._links_db[source]
-
- @staticmethod
- def _ActionForPath(source_path, link_path=None, check_fn=None,
- check_msg=None):
- """Create zero or more Actions to link to a file or directory.
-
- This will be a symlink on POSIX platforms. On Windows this requires
- that NTFS is version 5.0 or higher (Vista or newer).
-
- Args:
- source_path: Path relative to the Chromium checkout root.
- For readability, the path may contain slashes, which will
- automatically be converted to the right path delimiter on Windows.
- link_path: The location for the link to create. If omitted it will be the
- same path as source_path.
- check_fn: A function returning true if the type of filesystem object is
- correct for the attempted call. Otherwise an error message with
- check_msg will be printed.
- check_msg: String used to inform the user of an invalid attempt to create
- a file.
- Returns:
- A list of Action objects.
- """
- def fix_separators(path):
- if sys.platform.startswith('win'):
- return path.replace(os.altsep, os.sep)
- else:
- return path
-
- assert check_fn
- assert check_msg
- link_path = link_path or source_path
- link_path = fix_separators(link_path)
-
- source_path = fix_separators(source_path)
- source_path = os.path.join(CHROMIUM_CHECKOUT, source_path)
- if os.path.exists(source_path) and not check_fn:
- raise LinkError('_LinkChromiumPath can only be used to link to %s: '
- 'Tried to link to: %s' % (check_msg, source_path))
-
- if not os.path.exists(source_path):
- logging.debug('Silently ignoring missing source: %s. This is to avoid '
- 'errors on platform-specific dependencies.', source_path)
- return []
-
- actions = []
-
- if os.path.exists(link_path) or os.path.islink(link_path):
- if os.path.islink(link_path):
- actions.append(Remove(link_path, dangerous=False))
- elif os.path.isfile(link_path):
- actions.append(Remove(link_path, dangerous=True))
- elif os.path.isdir(link_path):
- actions.append(Rmtree(link_path))
- else:
- raise LinkError('Don\'t know how to plan: %s' % link_path)
-
- # Create parent directories to the target link if needed.
- target_parent_dirs = os.path.dirname(link_path)
- if (target_parent_dirs and
- target_parent_dirs != link_path and
- not os.path.exists(target_parent_dirs)):
- actions.append(Makedirs(target_parent_dirs))
-
- actions.append(Symlink(source_path, link_path))
-
- return actions
-
-def _initialize_database(filename):
- links_database = shelve.open(filename)
-
- # Wipe the database if this version of the script ends up looking at a
- # newer (future) version of the links db, just to be sure.
- version = links_database.get('SCHEMA_VERSION')
- if version and version != SCHEMA_VERSION:
- logging.info('Found database with schema version %s while this script only '
- 'supports %s. Wiping previous database contents.', version,
- SCHEMA_VERSION)
- links_database.clear()
- links_database['SCHEMA_VERSION'] = SCHEMA_VERSION
- return links_database
-
-
-def main():
- on_bot = os.environ.get('CHROME_HEADLESS') == '1'
-
- parser = optparse.OptionParser()
- parser.add_option('-d', '--dry-run', action='store_true', default=False,
- help='Print what would be done, but don\'t perform any '
- 'operations. This will automatically set logging to '
- 'verbose.')
- parser.add_option('-c', '--clean-only', action='store_true', default=False,
- help='Only clean previously created links, don\'t create '
- 'new ones. This will automatically set logging to '
- 'verbose.')
- parser.add_option('-f', '--force', action='store_true', default=on_bot,
- help='Force link creation. CAUTION: This deletes existing '
- 'folders and files in the locations where links are '
- 'about to be created.')
- parser.add_option('-n', '--no-prompt', action='store_false', dest='prompt',
- default=(not on_bot),
- help='Prompt if we\'re planning to do a dangerous action')
- parser.add_option('-v', '--verbose', action='store_const',
- const=logging.DEBUG, default=logging.INFO,
- help='Print verbose output for debugging.')
- options, _ = parser.parse_args()
-
- if options.dry_run or options.force or options.clean_only:
- options.verbose = logging.DEBUG
- logging.basicConfig(format='%(message)s', level=options.verbose)
-
- # Work from the root directory of the checkout.
- script_dir = os.path.dirname(os.path.abspath(__file__))
- os.chdir(script_dir)
-
- if sys.platform.startswith('win'):
- def is_admin():
- try:
- return os.getuid() == 0
- except AttributeError:
- return ctypes.windll.shell32.IsUserAnAdmin() != 0
- if not is_admin():
- logging.error('On Windows, you now need to have administrator '
- 'privileges for the shell running %s (or '
- '`gclient sync|runhooks`).\nPlease start another command '
- 'prompt as Administrator and try again.', sys.argv[0])
- return 1
-
- if not os.path.exists(CHROMIUM_CHECKOUT):
- logging.error('Cannot find a Chromium checkout at %s. Did you run "gclient '
- 'sync" before running this script?', CHROMIUM_CHECKOUT)
- return 2
-
- links_database = _initialize_database(LINKS_DB)
- try:
- symlink_creator = WebRTCLinkSetup(links_database, options.force,
- options.dry_run, options.prompt)
- symlink_creator.CleanupLinks()
- if not options.clean_only:
- symlink_creator.CreateLinks(on_bot)
- except LinkError as e:
- print >> sys.stderr, e.message
- return 3
- finally:
- links_database.close()
- return 0
-
-
-if __name__ == '__main__':
- sys.exit(main())
diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc
deleted file mode 100644
index 676527c1..00000000
--- a/files/source/compare_gcc.cc
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-#if defined(__x86_64__)
-uint32_t HammingDistance_SSE42(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint64_t diff = 0u;
-
- asm volatile(
- "xor %3,%3 \n"
- "xor %%r8,%%r8 \n"
- "xor %%r9,%%r9 \n"
- "xor %%r10,%%r10 \n"
-
- // Process 32 bytes per loop.
- LABELALIGN
- "1: \n"
- "mov (%0),%%rcx \n"
- "mov 0x8(%0),%%rdx \n"
- "xor (%1),%%rcx \n"
- "xor 0x8(%1),%%rdx \n"
- "popcnt %%rcx,%%rcx \n"
- "popcnt %%rdx,%%rdx \n"
- "mov 0x10(%0),%%rsi \n"
- "mov 0x18(%0),%%rdi \n"
- "xor 0x10(%1),%%rsi \n"
- "xor 0x18(%1),%%rdi \n"
- "popcnt %%rsi,%%rsi \n"
- "popcnt %%rdi,%%rdi \n"
- "add $0x20,%0 \n"
- "add $0x20,%1 \n"
- "add %%rcx,%3 \n"
- "add %%rdx,%%r8 \n"
- "add %%rsi,%%r9 \n"
- "add %%rdi,%%r10 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
-
- "add %%r8, %3 \n"
- "add %%r9, %3 \n"
- "add %%r10, %3 \n"
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=r"(diff) // %3
- :
- : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
-
- return static_cast<uint32_t>(diff);
-}
-#else
-uint32_t HammingDistance_SSE42(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- asm volatile(
- // Process 16 bytes per loop.
- LABELALIGN
- "1: \n"
- "mov (%0),%%ecx \n"
- "mov 0x4(%0),%%edx \n"
- "xor (%1),%%ecx \n"
- "xor 0x4(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "mov 0x8(%0),%%ecx \n"
- "mov 0xc(%0),%%edx \n"
- "xor 0x8(%1),%%ecx \n"
- "xor 0xc(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "add $0x10,%0 \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "+r"(diff) // %3
- :
- : "memory", "cc", "ecx", "edx");
-
- return diff;
-}
-#endif
-
-static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
- 15, 15, 15, 15, 15, 15, 15, 15};
-static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
-
-uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- asm volatile(
- "movdqa %4,%%xmm2 \n"
- "movdqa %5,%%xmm3 \n"
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa (%0),%%xmm4 \n"
- "movdqa 0x10(%0), %%xmm5 \n"
- "pxor (%0,%1), %%xmm4 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pand %%xmm2,%%xmm6 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm6,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "paddb %%xmm7,%%xmm6 \n"
- "pxor 0x10(%0,%1),%%xmm5 \n"
- "add $0x20,%0 \n"
- "movdqa %%xmm5,%%xmm4 \n"
- "pand %%xmm2,%%xmm5 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm5,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm5 \n"
- "pshufb %%xmm4,%%xmm5 \n"
- "paddb %%xmm7,%%xmm5 \n"
- "paddb %%xmm5,%%xmm6 \n"
- "psadbw %%xmm1,%%xmm6 \n"
- "paddd %%xmm6,%%xmm0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
-
- "pshufd $0xaa,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0, %3 \n"
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=r"(diff) // %3
- : "m"(kNibbleMask), // %4
- "m"(kBitCount) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-
- return diff;
-}
-
-#ifdef HAS_HAMMINGDISTANCE_AVX2
-uint32_t HammingDistance_AVX2(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- asm volatile(
- "vbroadcastf128 %4,%%ymm2 \n"
- "vbroadcastf128 %5,%%ymm3 \n"
- "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqa (%0),%%ymm4 \n"
- "vmovdqa 0x20(%0), %%ymm5 \n"
- "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
- "vpand %%ymm2,%%ymm4,%%ymm6 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
- "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
- "add $0x40,%0 \n"
- "vpand %%ymm2,%%ymm4,%%ymm5 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
- "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
- "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
-
- "vpermq $0xb1,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xaa,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovd %%xmm0, %3 \n"
- "vzeroupper \n"
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=r"(diff) // %3
- : "m"(kNibbleMask), // %4
- "m"(kBitCount) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-
- return diff;
-}
-#endif // HAS_HAMMINGDISTANCE_AVX2
-
-uint32_t SumSquareError_SSE2(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t sse;
- asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
-
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
-
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=g"(sse) // %3
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
- return sse;
-}
-
-static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
-static const uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-static const uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-static const uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-static const uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-
-uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
- uint32_t hash;
- asm volatile(
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "sub $0x10,%1 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
- : "+r"(src), // %0
- "+r"(count), // %1
- "+rm"(seed), // %2
- "=g"(hash) // %3
- : "m"(kHash16x33), // %4
- "m"(kHashMul0), // %5
- "m"(kHashMul1), // %6
- "m"(kHashMul2), // %7
- "m"(kHashMul3) // %8
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
- return hash;
-}
-#endif // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
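
The HashDjb2_SSE41 kernel above evaluates 16 steps of the djb2 recurrence per iteration: the running hash is multiplied by kHash16x33 (33^16) while the 16 new bytes are weighted by the kHashMul tables (33^15 down to 33^0) and summed in. A scalar sketch of the same hash for reference (HashDjb2_Ref is an illustrative name, not a library entry point):

    #include <stdint.h>

    // One djb2 step per byte: hash = hash * 33 + byte, starting from seed.
    // The SIMD kernel computes the identical value 16 bytes at a time.
    static uint32_t HashDjb2_Ref(const uint8_t* src, int count, uint32_t seed) {
      uint32_t hash = seed;
      for (int i = 0; i < count; ++i) {
        hash = hash * 33u + src[i];
      }
      return hash;
    }
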
diff --git a/files/source/compare_mmi.cc b/files/source/compare_mmi.cc
deleted file mode 100644
index 7640d946..00000000
--- a/files/source/compare_mmi.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for Mips MMI.
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-// Hakmem method for Hamming distance.
-uint32_t HammingDistance_MMI(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0;
- uint64_t c1 = 0x5555555555555555;
- uint64_t c2 = 0x3333333333333333;
- uint64_t c3 = 0x0f0f0f0f0f0f0f0f;
- uint32_t c4 = 0x01010101;
- uint64_t s1 = 1, s2 = 2, s3 = 4;
- __asm__ volatile(
- "1: \n\t"
- "ldc1 %[ta], 0(%[src_a]) \n\t"
- "ldc1 %[tb], 0(%[src_b]) \n\t"
- "xor %[temp], %[ta], %[tb] \n\t"
- "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1
- "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1
- "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1
- "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2)
- "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2
- "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2
- "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t
- "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4
- "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4)
- "and %[temp1], %[temp1], %[c3] \n\t" //&c3
- "dmfc1 $t0, %[temp1] \n\t"
- "dsrl32 $t0, $t0, 0 \n\t "
- "mul $t0, $t0, %[c4] \n\t"
- "dsrl $t0, $t0, 24 \n\t"
- "dadd %[diff], %[diff], $t0 \n\t"
- "dmfc1 $t0, %[temp1] \n\t"
- "mul $t0, $t0, %[c4] \n\t"
- "dsrl $t0, $t0, 24 \n\t"
- "dadd %[diff], %[diff], $t0 \n\t"
- "daddiu %[src_a], %[src_a], 8 \n\t"
- "daddiu %[src_b], %[src_b], 8 \n\t"
- "addiu %[count], %[count], -8 \n\t"
- "bgtz %[count], 1b \n\t"
- "nop \n\t"
- : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b),
- [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp),
- [temp1] "+f"(temp1)
- : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1),
- [s2] "f"(s2), [s3] "f"(s3)
- : "memory");
- return diff;
-}
-
-uint32_t SumSquareError_MMI(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t sse = 0u;
- uint32_t sse_hi = 0u, sse_lo = 0u;
-
- uint64_t src1, src2;
- uint64_t diff, diff_hi, diff_lo;
- uint64_t sse_sum, sse_tmp;
-
- const uint64_t mask = 0x0ULL;
-
- __asm__ volatile(
- "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t"
-
- "1: \n\t"
- "ldc1 %[src1], 0x00(%[src_a]) \n\t"
- "ldc1 %[src2], 0x00(%[src_b]) \n\t"
- "pasubub %[diff], %[src1], %[src2] \n\t"
- "punpcklbh %[diff_lo], %[diff], %[mask] \n\t"
- "punpckhbh %[diff_hi], %[diff], %[mask] \n\t"
- "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t"
- "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
- "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t"
- "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
-
- "daddiu %[src_a], %[src_a], 0x08 \n\t"
- "daddiu %[src_b], %[src_b], 0x08 \n\t"
- "daddiu %[count], %[count], -0x08 \n\t"
- "bnez %[count], 1b \n\t"
-
- "mfc1 %[sse_lo], %[sse_sum] \n\t"
- "mfhc1 %[sse_hi], %[sse_sum] \n\t"
- "daddu %[sse], %[sse_hi], %[sse_lo] \n\t"
- : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1),
- [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi),
- [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp),
- [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo)
- : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count),
- [mask] "f"(mask)
- : "memory");
-
- return sse;
-}
-
-#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
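
The MMI kernel above is the classic Hakmem-style SWAR bit count applied to each 64-bit XOR of the inputs. A plain-C sketch of the same reduction (names are illustrative, and count is assumed to be a multiple of 8, as in the kernel):

    #include <stdint.h>
    #include <string.h>

    // SWAR population count of one 64-bit word, mirroring the c1/c2/c3
    // constants in the assembly: 2-bit, then 4-bit, then 8-bit partial sums,
    // finally gathered into the top byte with a multiply.
    static uint32_t PopCount64(uint64_t x) {
      x -= (x >> 1) & 0x5555555555555555ULL;
      x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
      x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
      return (uint32_t)((x * 0x0101010101010101ULL) >> 56);
    }

    // Hamming distance over count bytes.
    static uint32_t HammingDistance_Ref(const uint8_t* a, const uint8_t* b,
                                        int count) {
      uint32_t diff = 0u;
      for (int i = 0; i < count; i += 8) {
        uint64_t wa, wb;
        memcpy(&wa, a + i, sizeof(wa));
        memcpy(&wb, b + i, sizeof(wb));
        diff += PopCount64(wa ^ wb);
      }
      return diff;
    }
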
diff --git a/files/source/compare_neon.cc b/files/source/compare_neon.cc
deleted file mode 100644
index 2a2181e0..00000000
--- a/files/source/compare_neon.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
- !defined(__aarch64__)
-
-// 256 bits at a time
-// uses short accumulator which restricts count to 131 KB
-uint32_t HammingDistance_NEON(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff;
-
- asm volatile(
- "vmov.u16 q4, #0 \n" // accumulator
-
- "1: \n"
- "vld1.8 {q0, q1}, [%0]! \n"
- "vld1.8 {q2, q3}, [%1]! \n"
- "veor.32 q0, q0, q2 \n"
- "veor.32 q1, q1, q3 \n"
- "vcnt.i8 q0, q0 \n"
- "vcnt.i8 q1, q1 \n"
- "subs %2, %2, #32 \n"
- "vadd.u8 q0, q0, q1 \n" // 16 byte counts
- "vpadal.u8 q4, q0 \n" // 8 shorts
- "bgt 1b \n"
-
- "vpaddl.u16 q0, q4 \n" // 4 ints
- "vpadd.u32 d0, d0, d1 \n"
- "vpadd.u32 d0, d0, d0 \n"
- "vmov.32 %3, d0[0] \n"
-
- : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
- :
- : "cc", "q0", "q1", "q2", "q3", "q4");
- return diff;
-}
-
-uint32_t SumSquareError_NEON(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t sse;
- asm volatile(
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q11, #0 \n"
-
- "1: \n"
- "vld1.8 {q0}, [%0]! \n"
- "vld1.8 {q1}, [%1]! \n"
- "subs %2, %2, #16 \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q8, d4, d4 \n"
- "vmlal.s16 q9, d6, d6 \n"
- "vmlal.s16 q10, d5, d5 \n"
- "vmlal.s16 q11, d7, d7 \n"
- "bgt 1b \n"
-
- "vadd.u32 q8, q8, q9 \n"
- "vadd.u32 q10, q10, q11 \n"
- "vadd.u32 q11, q8, q10 \n"
- "vpaddl.u32 q1, q11 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
- : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
- return sse;
-}
-
-#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
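
Both NEON kernels accumulate the plain sum of squared byte differences; the assembly widens pairs of 8-bit differences to 16 bits and multiply-accumulates into 32-bit lanes before a final horizontal add. A scalar sketch of the value being computed (SumSquareError_Ref is an illustrative name):

    #include <stdint.h>

    static uint32_t SumSquareError_Ref(const uint8_t* a, const uint8_t* b,
                                       int count) {
      uint32_t sse = 0u;
      for (int i = 0; i < count; ++i) {
        int d = (int)a[i] - (int)b[i];  // difference in [-255, 255]
        sse += (uint32_t)(d * d);
      }
      return sse;
    }
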
diff --git a/files/source/compare_neon64.cc b/files/source/compare_neon64.cc
deleted file mode 100644
index 6e8f672a..00000000
--- a/files/source/compare_neon64.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-// 256 bits at a time
-// uses short accumulator which restricts count to 131 KB
-uint32_t HammingDistance_NEON(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff;
- asm volatile(
- "movi v4.8h, #0 \n"
-
- "1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
- "eor v0.16b, v0.16b, v2.16b \n"
- "eor v1.16b, v1.16b, v3.16b \n"
- "cnt v0.16b, v0.16b \n"
- "cnt v1.16b, v1.16b \n"
- "subs %w2, %w2, #32 \n"
- "add v0.16b, v0.16b, v1.16b \n"
- "uadalp v4.8h, v0.16b \n"
- "b.gt 1b \n"
-
- "uaddlv s4, v4.8h \n"
- "fmov %w3, s4 \n"
- : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
- :
- : "cc", "v0", "v1", "v2", "v3", "v4");
- return diff;
-}
-
-uint32_t SumSquareError_NEON(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t sse;
- asm volatile(
- "eor v16.16b, v16.16b, v16.16b \n"
- "eor v18.16b, v18.16b, v18.16b \n"
- "eor v17.16b, v17.16b, v17.16b \n"
- "eor v19.16b, v19.16b, v19.16b \n"
-
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "subs %w2, %w2, #16 \n"
- "usubl v2.8h, v0.8b, v1.8b \n"
- "usubl2 v3.8h, v0.16b, v1.16b \n"
- "smlal v16.4s, v2.4h, v2.4h \n"
- "smlal v17.4s, v3.4h, v3.4h \n"
- "smlal2 v18.4s, v2.8h, v2.8h \n"
- "smlal2 v19.4s, v3.8h, v3.8h \n"
- "b.gt 1b \n"
-
- "add v16.4s, v16.4s, v17.4s \n"
- "add v18.4s, v18.4s, v19.4s \n"
- "add v19.4s, v16.4s, v18.4s \n"
- "addv s0, v19.4s \n"
- "fmov %w3, s0 \n"
- : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
- :
- : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
- return sse;
-}
-
-#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/convert.cc b/files/source/convert.cc
deleted file mode 100644
index 614fa482..00000000
--- a/files/source/convert.cc
+++ /dev/null
@@ -1,2576 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "libyuv/row.h"
-#include "libyuv/scale.h" // For ScalePlane()
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
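-// Shift right by s with upward rounding of the magnitude, symmetric about
-// zero: SUBSAMPLE(5, 1, 1) == 3 and SUBSAMPLE(-5, 1, 1) == -3.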
-#define SUBSAMPLE(v, a, s) (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))
-static __inline int Abs(int v) {
- return v >= 0 ? v : -v;
-}
-
-// Any I4xx To I420 format with mirroring.
-static int I4xxToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int src_y_width,
- int src_y_height,
- int src_uv_width,
- int src_uv_height) {
- const int dst_y_width = Abs(src_y_width);
- const int dst_y_height = Abs(src_y_height);
- const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
- const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
- if (src_uv_width == 0 || src_uv_height == 0) {
- return -1;
- }
- if (dst_y) {
- ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
- dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
- }
- ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
- dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
- ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
- dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
- return 0;
-}
-
-// Copy I420 with optional flipping.
-// TODO(fbarchard): Use ScalePlane, which supports mirroring, but ensure
-// it does row coalescing.
-LIBYUV_API
-int I420Copy(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
- // Copy UV planes.
- CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
- CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
- return 0;
-}
-
-// Copy I010 with optional flipping.
-LIBYUV_API
-int I010Copy(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint16_t* dst_y,
- int dst_stride_y,
- uint16_t* dst_u,
- int dst_stride_u,
- uint16_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- if (dst_y) {
- CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
- // Copy UV planes.
- CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
- CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
- return 0;
-}
-
-// Convert 10 bit YUV to 8 bit.
-LIBYUV_API
-int I010ToI420(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
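- // A scale of 16384 converts 10-bit samples to 8 bits: Convert16To8Plane
- // computes (v * scale) >> 16, and (v * 16384) >> 16 == v >> 2.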
- // Convert Y plane.
- Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width,
- height);
- // Convert UV planes.
- Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth,
- halfheight);
- Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth,
- halfheight);
- return 0;
-}
-
-// 422 chroma is 1/2 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I422ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- const int src_uv_width = SUBSAMPLE(width, 1, 1);
- return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, width, height, src_uv_width, height);
-}
-
-// TODO(fbarchard): Implement row conversion.
-LIBYUV_API
-int I422ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- // Allocate u and v buffers
- align_buffer_64(plane_u, halfwidth * halfheight * 2);
- uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
- I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
- dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
- height);
- MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
- halfwidth, halfheight);
- free_aligned_buffer_64(plane_u);
- return 0;
-}
-
-#ifdef I422TONV21_ROW_VERSION
-// Unittest fails for this version.
-// 422 chroma is 1/2 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-// Swap src_u and src_v to implement I422ToNV12
-LIBYUV_API
-int I422ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- int y;
- void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
- uint8_t* dst_uv, int width) = MergeUVRow_C;
- void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) = InterpolateRow_C;
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-#if defined(HAS_MERGEUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- MergeUVRow = MergeUVRow_Any_SSE2;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow = MergeUVRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- MergeUVRow = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
- MergeUVRow = MergeUVRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- MergeUVRow = MergeUVRow_Any_NEON;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow = MergeUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MergeUVRow = MergeUVRow_Any_MSA;
- if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow = MergeUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- MergeUVRow = MergeUVRow_MMI;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- InterpolateRow = InterpolateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- InterpolateRow = InterpolateRow_NEON;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- InterpolateRow = InterpolateRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- InterpolateRow = InterpolateRow_MSA;
- }
- }
-#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_MMI;
- }
- }
-#endif
-
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height);
- }
- {
- // Allocate 2 rows of vu.
- int awidth = halfwidth * 2;
- align_buffer_64(row_vu_0, awidth * 2);
- uint8_t* row_vu_1 = row_vu_0 + awidth;
-
- for (y = 0; y < height - 1; y += 2) {
- MergeUVRow(src_v, src_u, row_vu_0, halfwidth);
- MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1,
- halfwidth);
- InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128);
- src_u += src_stride_u * 2;
- src_v += src_stride_v * 2;
- dst_vu += dst_stride_vu;
- }
- if (height & 1) {
- MergeUVRow(src_v, src_u, dst_vu, halfwidth);
- }
- free_aligned_buffer_64(row_vu_0);
- }
- return 0;
-}
-#endif // I422TONV21_ROW_VERSION
-
-// 444 chroma is 1x width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I444ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, width, height, width, height);
-}
-
-// TODO(fbarchard): Implement row conversion.
-LIBYUV_API
-int I444ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- // Allocate u and v buffers
- align_buffer_64(plane_u, halfwidth * halfheight * 2);
- uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
- I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
- dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
- height);
- MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
- halfwidth, halfheight);
- free_aligned_buffer_64(plane_u);
- return 0;
-}
-
-// I400 is greyscale, typically used in MJPG.
-LIBYUV_API
-int I400ToI420(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
- SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
- SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
- return 0;
-}
-
-// I400 is greyscale, typically used in MJPG.
-LIBYUV_API
-int I400ToNV21(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!dst_vu || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
- SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128);
- return 0;
-}
-
-static void CopyPlane2(const uint8_t* src,
- int src_stride_0,
- int src_stride_1,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- int y;
- void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_AVX)
- if (TestCpuFlag(kCpuHasAVX)) {
- CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
- }
-#endif
-
- // Copy plane
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src, dst, width);
- CopyRow(src + src_stride_0, dst + dst_stride, width);
- src += src_stride_0 + src_stride_1;
- dst += dst_stride * 2;
- }
- if (height & 1) {
- CopyRow(src, dst, width);
- }
-}
-
-// Support converting from FOURCC_M420
-// Useful for bandwidth-constrained transports like USB 1.0 and 2.0 and for
-// easy conversion to I420.
-// M420 format description:
-// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
-// Chroma is half width / half height. (420)
-// src_stride_m420 is row planar. Normally this will be the width in pixels.
-// The UV plane is half width but holds 2 interleaved values per pair of
-// pixels, so src_stride_m420 applies to it as well as to the two Y planes.
-static int X420ToI420(const uint8_t* src_y,
- int src_stride_y0,
- int src_stride_y1,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- if (dst_y) {
- dst_y = dst_y + (height - 1) * dst_stride_y;
- }
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-  // Coalesce rows: when the strides equal the width, the plane is
-  // contiguous and can be processed as one long row.
- if (src_stride_y0 == width && src_stride_y1 == width &&
- dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
- }
- // Coalesce rows.
- if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
- dst_stride_v == halfwidth) {
- halfwidth *= halfheight;
- halfheight = 1;
- src_stride_uv = dst_stride_u = dst_stride_v = 0;
- }
-
- if (dst_y) {
- if (src_stride_y0 == src_stride_y1) {
- CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
- } else {
- CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
- width, height);
- }
- }
-
- // Split UV plane - NV12 / NV21
- SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
- halfwidth, halfheight);
-
- return 0;
-}
-
-// Convert NV12 to I420.
-LIBYUV_API
-int NV12ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv,
- dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
- dst_stride_v, width, height);
-}
-
-// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
-LIBYUV_API
-int NV21ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu,
- dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u,
- dst_stride_u, width, height);
-}
-
-// Convert M420 to I420.
-LIBYUV_API
-int M420ToI420(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
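-  // M420 groups rows as [Y, Y, UV]: the two Y rows of a group are
-  // src_stride_m420 apart, the UV row starts 2 strides into the group, and
-  // consecutive groups (and UV rows) are src_stride_m420 * 3 apart.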
- return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
- src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y,
- dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
- width, height);
-}
-
-// Convert YUY2 to I420.
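-// YUY2 packs pixels as Y0 U Y1 V; the UV row function averages chroma from
-// two source rows to produce each subsampled I420 chroma row.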
-LIBYUV_API
-int YUY2ToI420(const uint8_t* src_yuy2,
- int src_stride_yuy2,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- YUY2ToUVRow_C;
- void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
- YUY2ToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
- src_stride_yuy2 = -src_stride_yuy2;
- }
-#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
- YUY2ToYRow = YUY2ToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_SSE2;
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
- YUY2ToYRow = YUY2ToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToUVRow = YUY2ToUVRow_AVX2;
- YUY2ToYRow = YUY2ToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- YUY2ToYRow = YUY2ToYRow_Any_NEON;
- YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToYRow = YUY2ToYRow_NEON;
- YUY2ToUVRow = YUY2ToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- YUY2ToYRow = YUY2ToYRow_Any_MSA;
- YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToYRow = YUY2ToYRow_MSA;
- YUY2ToUVRow = YUY2ToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- YUY2ToYRow = YUY2ToYRow_Any_MMI;
- YUY2ToUVRow = YUY2ToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToYRow = YUY2ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_MMI;
- }
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
- YUY2ToYRow(src_yuy2, dst_y, width);
- YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
- src_yuy2 += src_stride_yuy2 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
- YUY2ToYRow(src_yuy2, dst_y, width);
- }
- return 0;
-}
-
-// Convert UYVY to I420.
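-// UYVY is the byte-swapped variant of YUY2 (U Y0 V Y1) and follows the same
-// two-source-rows-per-chroma-row pattern.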
-LIBYUV_API
-int UYVYToI420(const uint8_t* src_uyvy,
- int src_stride_uyvy,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- UYVYToUVRow_C;
- void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
- UYVYToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
- src_stride_uyvy = -src_stride_uyvy;
- }
-#if defined(HAS_UYVYTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- UYVYToUVRow = UYVYToUVRow_Any_SSE2;
- UYVYToYRow = UYVYToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- UYVYToUVRow = UYVYToUVRow_SSE2;
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- UYVYToUVRow = UYVYToUVRow_Any_AVX2;
- UYVYToYRow = UYVYToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- UYVYToUVRow = UYVYToUVRow_AVX2;
- UYVYToYRow = UYVYToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- UYVYToYRow = UYVYToYRow_Any_NEON;
- UYVYToUVRow = UYVYToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_NEON;
- UYVYToUVRow = UYVYToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- UYVYToYRow = UYVYToYRow_Any_MSA;
- UYVYToUVRow = UYVYToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- UYVYToYRow = UYVYToYRow_MSA;
- UYVYToUVRow = UYVYToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_UYVYTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- UYVYToYRow = UYVYToYRow_Any_MMI;
- UYVYToUVRow = UYVYToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_MMI;
- UYVYToUVRow = UYVYToUVRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
- UYVYToYRow(src_uyvy, dst_y, width);
- UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
- src_uyvy += src_stride_uyvy * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
- UYVYToYRow(src_uyvy, dst_y, width);
- }
- return 0;
-}
-
-// Convert AYUV to NV12.
-LIBYUV_API
-int AYUVToNV12(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_uv,
- int dst_stride_uv,
- int width,
- int height) {
- int y;
- void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv,
- uint8_t* dst_uv, int width) = AYUVToUVRow_C;
- void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
- AYUVToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
- src_stride_ayuv = -src_stride_ayuv;
- }
-// Placeholders for future Intel code.
-#if defined(HAS_AYUVTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- AYUVToUVRow = AYUVToUVRow_Any_SSE2;
- AYUVToYRow = AYUVToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- AYUVToUVRow = AYUVToUVRow_SSE2;
- AYUVToYRow = AYUVToYRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_AYUVTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- AYUVToUVRow = AYUVToUVRow_Any_AVX2;
- AYUVToYRow = AYUVToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- AYUVToUVRow = AYUVToUVRow_AVX2;
- AYUVToYRow = AYUVToYRow_AVX2;
- }
- }
-#endif
-
-#if defined(HAS_AYUVTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- AYUVToYRow = AYUVToYRow_Any_NEON;
- AYUVToUVRow = AYUVToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- AYUVToYRow = AYUVToYRow_NEON;
- AYUVToUVRow = AYUVToUVRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
- AYUVToYRow(src_ayuv, dst_y, width);
- AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
- src_ayuv += src_stride_ayuv * 2;
- dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
- }
- if (height & 1) {
- AYUVToUVRow(src_ayuv, 0, dst_uv, width);
- AYUVToYRow(src_ayuv, dst_y, width);
- }
- return 0;
-}
-
-// Convert AYUV to NV21.
-LIBYUV_API
-int AYUVToNV21(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- int y;
- void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
- uint8_t* dst_vu, int width) = AYUVToVURow_C;
- void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
- AYUVToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
- src_stride_ayuv = -src_stride_ayuv;
- }
-// Placeholders for future Intel code.
-#if defined(HAS_AYUVTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- AYUVToVURow = AYUVToVURow_Any_SSE2;
- AYUVToYRow = AYUVToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- AYUVToVURow = AYUVToVURow_SSE2;
- AYUVToYRow = AYUVToYRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_AYUVTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- AYUVToVURow = AYUVToVURow_Any_AVX2;
- AYUVToYRow = AYUVToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- AYUVToVURow = AYUVToVURow_AVX2;
- AYUVToYRow = AYUVToYRow_AVX2;
- }
- }
-#endif
-
-#if defined(HAS_AYUVTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- AYUVToYRow = AYUVToYRow_Any_NEON;
- AYUVToVURow = AYUVToVURow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- AYUVToYRow = AYUVToYRow_NEON;
- AYUVToVURow = AYUVToVURow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
- AYUVToYRow(src_ayuv, dst_y, width);
- AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
- src_ayuv += src_stride_ayuv * 2;
- dst_y += dst_stride_y * 2;
- dst_vu += dst_stride_vu;
- }
- if (height & 1) {
- AYUVToVURow(src_ayuv, 0, dst_vu, width);
- AYUVToYRow(src_ayuv, dst_y, width);
- }
- return 0;
-}
-
-// Convert ARGB to I420.
-LIBYUV_API
-int ARGBToI420(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYRow_C;
- if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
- ARGBToYRow(src_argb, dst_y, width);
- ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
- src_argb += src_stride_argb * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
- ARGBToYRow(src_argb, dst_y, width);
- }
- return 0;
-}
-
-// Convert BGRA to I420.
-LIBYUV_API
-int BGRAToI420(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- BGRAToUVRow_C;
- void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) =
- BGRAToYRow_C;
- if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_bgra = src_bgra + (height - 1) * src_stride_bgra;
- src_stride_bgra = -src_stride_bgra;
- }
-#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
- BGRAToYRow = BGRAToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_SSSE3;
- BGRAToYRow = BGRAToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_BGRATOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- BGRAToYRow = BGRAToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- BGRAToYRow = BGRAToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_BGRATOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- BGRAToUVRow = BGRAToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_BGRATOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- BGRAToYRow = BGRAToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- BGRAToYRow = BGRAToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_BGRATOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- BGRAToUVRow = BGRAToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_BGRATOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BGRAToYRow = BGRAToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- BGRAToYRow = BGRAToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_BGRATOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BGRAToUVRow = BGRAToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
- BGRAToYRow(src_bgra, dst_y, width);
- BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
- src_bgra += src_stride_bgra * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
- BGRAToYRow(src_bgra, dst_y, width);
- }
- return 0;
-}
-
-// Convert ABGR to I420.
-LIBYUV_API
-int ABGRToI420(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ABGRToUVRow_C;
- void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
- ABGRToYRow_C;
- if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_abgr = src_abgr + (height - 1) * src_stride_abgr;
- src_stride_abgr = -src_stride_abgr;
- }
-#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
- ABGRToYRow = ABGRToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_SSSE3;
- ABGRToYRow = ABGRToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ABGRTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ABGRToYRow = ABGRToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ABGRToYRow = ABGRToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ABGRToUVRow = ABGRToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ABGRTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToYRow = ABGRToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ABGRToYRow = ABGRToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToUVRow = ABGRToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ABGRTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToYRow = ABGRToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ABGRToYRow = ABGRToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToUVRow = ABGRToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
- ABGRToYRow(src_abgr, dst_y, width);
- ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
- src_abgr += src_stride_abgr * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
- ABGRToYRow(src_abgr, dst_y, width);
- }
- return 0;
-}
-
-// Convert RGBA to I420.
-LIBYUV_API
-int RGBAToI420(const uint8_t* src_rgba,
- int src_stride_rgba,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- RGBAToUVRow_C;
- void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) =
- RGBAToYRow_C;
- if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgba = src_rgba + (height - 1) * src_stride_rgba;
- src_stride_rgba = -src_stride_rgba;
- }
-#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
- RGBAToYRow = RGBAToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_SSSE3;
- RGBAToYRow = RGBAToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_RGBATOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGBAToYRow = RGBAToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGBAToYRow = RGBAToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RGBATOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGBAToUVRow = RGBAToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RGBATOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGBAToYRow = RGBAToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGBAToYRow = RGBAToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RGBATOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGBAToUVRow = RGBAToUVRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RGBATOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGBAToYRow = RGBAToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGBAToYRow = RGBAToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_RGBATOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGBAToUVRow = RGBAToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
- RGBAToYRow(src_rgba, dst_y, width);
- RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
- src_rgba += src_stride_rgba * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
- RGBAToYRow(src_rgba, dst_y, width);
- }
- return 0;
-}
-
-// Convert RGB24 to I420.
-LIBYUV_API
-int RGB24ToI420(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
- void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- RGB24ToUVRow_C;
- void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
- RGB24ToYRow_C;
-#else
- void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RGB24ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYRow_C;
-#endif
- if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
- src_stride_rgb24 = -src_stride_rgb24;
- }
-
-// Neon version does direct RGB24 to YUV.
-#if defined(HAS_RGB24TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
- RGB24ToYRow = RGB24ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYRow = RGB24ToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVRow = RGB24ToUVRow_NEON;
- }
- }
- }
-#elif defined(HAS_RGB24TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
- RGB24ToYRow = RGB24ToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToYRow = RGB24ToYRow_MSA;
- RGB24ToUVRow = RGB24ToUVRow_MSA;
- }
- }
-#elif defined(HAS_RGB24TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB24ToUVRow = RGB24ToUVRow_Any_MMI;
- RGB24ToYRow = RGB24ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYRow = RGB24ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVRow = RGB24ToUVRow_MMI;
- }
- }
- }
-// Other platforms do intermediate conversion from RGB24 to ARGB.
-#else
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#endif
-
- {
-#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
- // Allocate 2 rows of ARGB.
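-    // kRowSize rounds the ARGB row size up to a multiple of 32 bytes so the
-    // two temporary rows stay SIMD-friendly.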
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
- RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
- RGB24ToYRow(src_rgb24, dst_y, width);
- RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_rgb24 += src_stride_rgb24 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
- RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
- RGB24ToYRow(src_rgb24, dst_y, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// TODO(fbarchard): Use Matrix version to implement I420 and J420.
-// Convert RGB24 to J420.
-LIBYUV_API
-int RGB24ToJ420(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- RGB24ToUVJRow_C;
- void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
- RGB24ToYJRow_C;
-#else
- void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RGB24ToARGBRow_C;
- void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVJRow_C;
- void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYJRow_C;
-#endif
- if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
- src_stride_rgb24 = -src_stride_rgb24;
- }
-
-// Neon version does direct RGB24 to YUV.
-#if defined(HAS_RGB24TOYJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
- RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYJRow = RGB24ToYJRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVJRow = RGB24ToUVJRow_NEON;
- }
- }
- }
-#elif defined(HAS_RGB24TOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA;
- RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToYJRow = RGB24ToYJRow_MSA;
- RGB24ToUVJRow = RGB24ToUVJRow_MSA;
- }
- }
-#elif defined(HAS_RGB24TOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI;
- RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYJRow = RGB24ToYJRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVJRow = RGB24ToUVJRow_MMI;
- }
- }
- }
-// Other platforms do intermediate conversion from RGB24 to ARGB.
-#else
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
- ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
- ARGBToYJRow = ARGBToYJRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
- ARGBToYJRow = ARGBToYJRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVJRow = ARGBToUVJRow_AVX2;
- ARGBToYJRow = ARGBToYJRow_AVX2;
- }
- }
-#endif
-#endif
-
- {
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
- RGB24ToYJRow(src_rgb24, dst_y, width);
- RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
- ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYJRow(row, dst_y, width);
- ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_rgb24 += src_stride_rgb24 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
- RGB24ToYJRow(src_rgb24, dst_y, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- ARGBToUVJRow(row, 0, dst_u, dst_v, width);
- ARGBToYJRow(row, dst_y, width);
-#endif
- }
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert RAW to I420.
-LIBYUV_API
-int RAWToI420(const uint8_t* src_raw,
- int src_stride_raw,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
- void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
- uint8_t* dst_v, int width) = RAWToUVRow_C;
- void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
- RAWToYRow_C;
-#else
- void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RAWToARGBRow_C;
- void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYRow_C;
-#endif
- if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_raw = src_raw + (height - 1) * src_stride_raw;
- src_stride_raw = -src_stride_raw;
- }
-
-// Neon version does direct RAW to YUV.
-#if defined(HAS_RAWTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RAWToUVRow = RAWToUVRow_Any_NEON;
- RAWToYRow = RAWToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RAWToYRow = RAWToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- RAWToUVRow = RAWToUVRow_NEON;
- }
- }
- }
-#elif defined(HAS_RAWTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RAWToUVRow = RAWToUVRow_Any_MSA;
- RAWToYRow = RAWToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RAWToYRow = RAWToYRow_MSA;
- RAWToUVRow = RAWToUVRow_MSA;
- }
- }
-#elif defined(HAS_RAWTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RAWToUVRow = RAWToUVRow_Any_MMI;
- RAWToYRow = RAWToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RAWToYRow = RAWToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- RAWToUVRow = RAWToUVRow_MMI;
- }
- }
- }
-// Other platforms do intermediate conversion from RAW to ARGB.
-#else
-#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RAWToARGBRow = RAWToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#endif
-
- {
-#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
- RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
- RAWToYRow(src_raw, dst_y, width);
- RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
-#else
- RAWToARGBRow(src_raw, row, width);
- RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_raw += src_stride_raw * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
- RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
- RAWToYRow(src_raw, dst_y, width);
-#else
- RAWToARGBRow(src_raw, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert RGB565 to I420.
-LIBYUV_API
-int RGB565ToI420(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
- void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- RGB565ToUVRow_C;
- void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) =
- RGB565ToYRow_C;
-#else
- void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
- int width) = RGB565ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYRow_C;
-#endif
- if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
- src_stride_rgb565 = -src_stride_rgb565;
- }
-
-// Neon version does direct RGB565 to YUV.
-#if defined(HAS_RGB565TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
- RGB565ToYRow = RGB565ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToYRow = RGB565ToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToUVRow = RGB565ToUVRow_NEON;
- }
- }
- }
-#elif defined(HAS_RGB565TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
- RGB565ToYRow = RGB565ToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToYRow = RGB565ToYRow_MSA;
- RGB565ToUVRow = RGB565ToUVRow_MSA;
- }
- }
-#elif defined(HAS_RGB565TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB565ToUVRow = RGB565ToUVRow_Any_MMI;
- RGB565ToYRow = RGB565ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToYRow = RGB565ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToUVRow = RGB565ToUVRow_MMI;
- }
- }
- }
-// Other platforms do intermediate conversion from RGB565 to ARGB.
-#else
-#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_RGB565TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#endif
- {
-#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
- RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
- RGB565ToYRow(src_rgb565, dst_y, width);
- RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
-#else
- RGB565ToARGBRow(src_rgb565, row, width);
- RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_rgb565 += src_stride_rgb565 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
- RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
- RGB565ToYRow(src_rgb565, dst_y, width);
-#else
- RGB565ToARGBRow(src_rgb565, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert ARGB1555 to I420.
-LIBYUV_API
-int ARGB1555ToI420(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
- void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGB1555ToUVRow_C;
- void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y,
- int width) = ARGB1555ToYRow_C;
-#else
- void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
- int width) = ARGB1555ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYRow_C;
-#endif
- if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
- src_stride_argb1555 = -src_stride_argb1555;
- }
-
-// Neon version does direct ARGB1555 to YUV.
-#if defined(HAS_ARGB1555TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
- ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToYRow = ARGB1555ToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
- }
- }
- }
-#elif defined(HAS_ARGB1555TOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
- ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToYRow = ARGB1555ToYRow_MSA;
- ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
- }
- }
-#elif defined(HAS_ARGB1555TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI;
- ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToYRow = ARGB1555ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_MMI;
- }
- }
- }
-// Other platforms do intermediate conversion from ARGB1555 to ARGB.
-#else
-#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#endif
- {
-#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
- ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
- ARGB1555ToYRow(src_argb1555, dst_y, width);
- ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
- width);
-#else
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
- width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_argb1555 += src_stride_argb1555 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
- ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
- ARGB1555ToYRow(src_argb1555, dst_y, width);
-#else
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert ARGB4444 to I420.
-LIBYUV_API
-int ARGB4444ToI420(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
-#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
- void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGB4444ToUVRow_C;
- void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y,
- int width) = ARGB4444ToYRow_C;
-#else
- void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
- int width) = ARGB4444ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
- ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
- ARGBToYRow_C;
-#endif
- if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
- src_stride_argb4444 = -src_stride_argb4444;
- }
-
-// Neon version does direct ARGB4444 to YUV.
-#if defined(HAS_ARGB4444TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
- ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToYRow = ARGB4444ToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
- }
- }
- }
-#elif defined(HAS_ARGB4444TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI;
- ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToYRow = ARGB4444ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_MMI;
- }
- }
- }
-// Other platforms do intermediate conversion from ARGB4444 to ARGB.
-#else
-#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
- ARGBToYRow = ARGBToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
- }
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
- }
- }
- }
-#endif
-#endif
-
- {
-#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
- ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
- ARGB4444ToYRow(src_argb4444, dst_y, width);
- ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
- width);
-#else
- ARGB4444ToARGBRow(src_argb4444, row, width);
- ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
- width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_argb4444 += src_stride_argb4444 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
- ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
- ARGB4444ToYRow(src_argb4444, dst_y, width);
-#else
- ARGB4444ToARGBRow(src_argb4444, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
-// Convert RGB24 to J400.
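-// J400 is a single full-range (JPEG) luma plane.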
-LIBYUV_API
-int RGB24ToJ400(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_yj,
- int dst_stride_yj,
- int width,
- int height) {
- int y;
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) =
- RGB24ToYJRow_C;
-#else
- void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RGB24ToARGBRow_C;
- void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
- ARGBToYJRow_C;
-#endif
- if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
- src_stride_rgb24 = -src_stride_rgb24;
- }
-
-// Neon version does direct RGB24 to YUV.
-#if defined(HAS_RGB24TOYJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYJRow = RGB24ToYJRow_NEON;
- }
- }
-#elif defined(HAS_RGB24TOYJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToYJRow = RGB24ToYJRow_MSA;
- }
- }
-#elif defined(HAS_RGB24TOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYJRow = RGB24ToYJRow_MMI;
- }
- }
-// Other platforms do intermediate conversion from RGB24 to ARGB.
-#else
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToYJRow = ARGBToYJRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToYJRow = ARGBToYJRow_AVX2;
- }
- }
-#endif
-#endif
-
- {
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- RGB24ToYJRow(src_rgb24, dst_yj, width);
- RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_yj + dst_stride_yj, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
- ARGBToYJRow(row, dst_yj, width);
- ARGBToYJRow(row + kRowSize, dst_yj + dst_stride_yj, width);
-#endif
- src_rgb24 += src_stride_rgb24 * 2;
- dst_yj += dst_stride_yj * 2;
- }
- if (height & 1) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- RGB24ToYJRow(src_rgb24, dst_yj, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- ARGBToYJRow(row, dst_yj, width);
-#endif
- }
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- free_aligned_buffer_64(row);
-#endif
- }
- return 0;
-}
-
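-// Copy every src_pixel_stride_uv-th byte into a packed plane; general-case
-// fallback for Android420 chroma with an arbitrary pixel stride.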
-static void SplitPixels(const uint8_t* src_u,
- int src_pixel_stride_uv,
- uint8_t* dst_u,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- *dst_u = *src_u;
- ++dst_u;
- src_u += src_pixel_stride_uv;
- }
-}
-
-// Convert Android420 to I420.
-LIBYUV_API
-int Android420ToI420(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int y;
- const ptrdiff_t vu_off = src_v - src_u;
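-  // Distance between the V and U base pointers: with pixel stride 2, +1
-  // indicates interleaved NV12 chroma and -1 indicates NV21, which the fast
-  // paths below split in a single pass.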
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
-  // Copy UV planes as-is - I420
- if (src_pixel_stride_uv == 1) {
- CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
- CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
-    return 0;
-  }
-  // Split UV planes - NV21
- if (src_pixel_stride_uv == 2 && vu_off == -1 &&
- src_stride_u == src_stride_v) {
- SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
- halfwidth, halfheight);
-    return 0;
-  }
-  // Split UV planes - NV12
- if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
- SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
- halfwidth, halfheight);
- return 0;
- }
-
- for (y = 0; y < halfheight; ++y) {
- SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth);
- SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth);
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- return 0;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc
deleted file mode 100644
index 54050333..00000000
--- a/files/source/convert_argb.cc
+++ /dev/null
@@ -1,2371 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert_argb.h"
-
-#include "libyuv/cpu_id.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle.
-#include "libyuv/rotate_argb.h"
-#include "libyuv/row.h"
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Copy ARGB with optional flipping.
-LIBYUV_API
-int ARGBCopy(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- if (!src_argb || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-
- CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4,
- height);
- return 0;
-}
-
-// Convert I420 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToARGBRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
-
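- // 4:2:0: each chroma row covers two luma rows, so advance the U and V
- // pointers only after every odd output row.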
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to ARGB.
-LIBYUV_API
-int I420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to ABGR.
-LIBYUV_API
-int I420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert J420 to ARGB.
-LIBYUV_API
-int J420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvJPEGConstants, width, height);
-}
-
-// Convert J420 to ABGR.
-LIBYUV_API
-int J420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuJPEGConstants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H420 to ARGB.
-LIBYUV_API
-int H420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvH709Constants, width, height);
-}
-
-// Convert H420 to ABGR.
-LIBYUV_API
-int H420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToARGBRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
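- // When every plane's rows are contiguous in memory the image can be
- // processed as a single long row, paying loop overhead only once.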
- if (src_stride_y == width && src_stride_u * 2 == width &&
- src_stride_v * 2 == width && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I422 to ARGB.
-LIBYUV_API
-int I422ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I422 to ABGR.
-LIBYUV_API
-int I422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I422ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert J422 to ARGB.
-LIBYUV_API
-int J422ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvJPEGConstants, width, height);
-}
-
-// Convert J422 to ABGR.
-LIBYUV_API
-int J422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I422ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuJPEGConstants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H422 to ARGB.
-LIBYUV_API
-int H422ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvH709Constants, width, height);
-}
-
-// Convert H422 to ABGR.
-LIBYUV_API
-int H422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I422ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert 10 bit YUV to AR30 with matrix
-// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
-// multiply 10 bit yuv into high bits to allow any number of bits.
-static int I010ToAR30Matrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
- const uint16_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I210ToAR30Row_C;
- if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
- dst_stride_ar30 = -dst_stride_ar30;
- }
-#if defined(HAS_I210TOAR30ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I210ToAR30Row = I210ToAR30Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I210TOAR30ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I210ToAR30Row = I210ToAR30Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I210ToAR30Row = I210ToAR30Row_AVX2;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
- dst_ar30 += dst_stride_ar30;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I010 to AR30.
-LIBYUV_API
-int I010ToAR30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYuvI601Constants, width, height);
-}
-
-// Convert H010 to AR30.
-LIBYUV_API
-int H010ToAR30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYuvH709Constants, width, height);
-}
-
-// Convert I010 to AB30.
-LIBYUV_API
-int I010ToAB30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
- int width,
- int height) {
- return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
- src_stride_u, dst_ab30, dst_stride_ab30,
- &kYvuI601Constants, width, height);
-}
-
-// Convert H010 to AB30.
-LIBYUV_API
-int H010ToAB30(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
- int width,
- int height) {
- return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
- src_stride_u, dst_ab30, dst_stride_ab30,
- &kYvuH709Constants, width, height);
-}
-
-// Convert 10 bit YUV to ARGB with matrix
-static int I010ToARGBMatrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
- const uint16_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I210ToARGBRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_I210TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I210ToARGBRow = I210ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I210TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I210ToARGBRow = I210ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I210ToARGBRow = I210ToARGBRow_AVX2;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I010 to ARGB.
-LIBYUV_API
-int I010ToARGB(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I010 to ABGR.
-LIBYUV_API
-int I010ToABGR(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I010ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H010 to ARGB.
-LIBYUV_API
-int H010ToARGB(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvH709Constants, width, height);
-}
-
-// Convert H010 to ABGR.
-LIBYUV_API
-int H010ToABGR(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I010ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I444ToARGBRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
- }
-#if defined(HAS_I444TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I444TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I444ToARGBRow = I444ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I444ToARGBRow = I444ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I444ToARGBRow = I444ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I444TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I444ToARGBRow = I444ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I444 to ARGB.
-LIBYUV_API
-int I444ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I444 to ABGR.
-LIBYUV_API
-int I444ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert J444 to ARGB.
-LIBYUV_API
-int J444ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- &kYuvJPEGConstants, width, height);
-}
-
-// Convert I420 with Alpha to ARGB with matrix; optionally premultiplies
-// (attenuates) RGB by alpha.
-static int I420AlphaToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height,
- int attenuate) {
- int y;
- void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) = I422AlphaToARGBRow_C;
- void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
- int width) = ARGBAttenuateRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
- if (IS_ALIGNED(width, 4)) {
- ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBAttenuateRow = ARGBAttenuateRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBAttenuateRow = ARGBAttenuateRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBAttenuateRow = ARGBAttenuateRow_MMI;
- }
- }
-#endif
-
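- // Convert a row, then optionally multiply RGB by alpha in place to
- // produce premultiplied (attenuated) output.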
- for (y = 0; y < height; ++y) {
- I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
- width);
- if (attenuate) {
- ARGBAttenuateRow(dst_argb, dst_argb, width);
- }
- dst_argb += dst_stride_argb;
- src_a += src_stride_a;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 with Alpha to ARGB.
-LIBYUV_API
-int I420AlphaToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height,
- int attenuate) {
- return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, src_a, src_stride_a, dst_argb,
- dst_stride_argb, &kYuvI601Constants, width,
- height, attenuate);
-}
-
-// Convert I420 with Alpha to ABGR.
-LIBYUV_API
-int I420AlphaToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height,
- int attenuate) {
- return I420AlphaToARGBMatrix(
- src_y, src_stride_y, src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height, attenuate);
-}
-
-// Convert I400 to ARGB.
-LIBYUV_API
-int I400ToARGB(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) =
- I400ToARGBRow_C;
- if (!src_y || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_y == width && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_argb = 0;
- }
-#if defined(HAS_I400TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- I400ToARGBRow = I400ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- I400ToARGBRow = I400ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_I400TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I400ToARGBRow = I400ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I400ToARGBRow = I400ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I400TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I400ToARGBRow = I400ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I400ToARGBRow = I400ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I400TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I400ToARGBRow = I400ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- I400ToARGBRow = I400ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_I400TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I400ToARGBRow = I400ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I400ToARGBRow = I400ToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I400ToARGBRow(src_y, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- }
- return 0;
-}
-
-// Convert J400 to ARGB.
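-// J400 is full range luma (JPEG), so no range expansion is applied,
-// unlike I400 above.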
-LIBYUV_API
-int J400ToARGB(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) =
- J400ToARGBRow_C;
- if (!src_y || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- // Coalesce rows.
- if (src_stride_y == width && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_argb = 0;
- }
-#if defined(HAS_J400TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- J400ToARGBRow = J400ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- J400ToARGBRow = J400ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_J400TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- J400ToARGBRow = J400ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- J400ToARGBRow = J400ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_J400TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- J400ToARGBRow = J400ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- J400ToARGBRow = J400ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_J400TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- J400ToARGBRow = J400ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- J400ToARGBRow = J400ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_J400TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- J400ToARGBRow = J400ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- J400ToARGBRow = J400ToARGBRow_MMI;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- J400ToARGBRow(src_y, dst_argb, width);
- src_y += src_stride_y;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
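-// Each shuffle table entry is the source byte index that supplies the
-// corresponding destination byte; 16 entries cover one 128 bit vector.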
-// Shuffle table for converting BGRA to ARGB.
-static const uvec8 kShuffleMaskBGRAToARGB = {
- 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
-
-// Shuffle table for converting ABGR to ARGB.
-static const uvec8 kShuffleMaskABGRToARGB = {
- 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
-
-// Shuffle table for converting RGBA to ARGB.
-static const uvec8 kShuffleMaskRGBAToARGB = {
- 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
-
-// Convert BGRA to ARGB.
-LIBYUV_API
-int BGRAToARGB(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
-}
-
-// Convert ARGB to BGRA (same as BGRAToARGB).
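-// Reversing all four bytes of a pixel is its own inverse, so the same
-// shuffle table converts in either direction.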
-LIBYUV_API
-int ARGBToBGRA(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
-}
-
-// Convert ABGR to ARGB.
-LIBYUV_API
-int ABGRToARGB(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
-}
-
-// Convert ARGB to ABGR (same as ABGRToARGB).
-LIBYUV_API
-int ARGBToABGR(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
-}
-
-// Convert RGBA to ARGB.
-LIBYUV_API
-int RGBAToARGB(const uint8_t* src_rgba,
- int src_stride_rgba,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height);
-}
-
-// Convert RGB24 to ARGB.
-LIBYUV_API
-int RGB24ToARGB(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RGB24ToARGBRow_C;
- if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
- src_stride_rgb24 = -src_stride_rgb24;
- }
- // Coalesce rows.
- if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_rgb24 = dst_stride_argb = 0;
- }
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_RGB24TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToARGBRow = RGB24ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RGB24TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RGB24TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- RGB24ToARGBRow = RGB24ToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- RGB24ToARGBRow(src_rgb24, dst_argb, width);
- src_rgb24 += src_stride_rgb24;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert RAW to ARGB.
-LIBYUV_API
-int RAWToARGB(const uint8_t* src_raw,
- int src_stride_raw,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RAWToARGBRow_C;
- if (!src_raw || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_raw = src_raw + (height - 1) * src_stride_raw;
- src_stride_raw = -src_stride_raw;
- }
- // Coalesce rows.
- if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_raw = dst_stride_argb = 0;
- }
-#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RAWToARGBRow = RAWToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_RAWTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RAWToARGBRow = RAWToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RAWToARGBRow = RAWToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RAWTOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RAWToARGBRow = RAWToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RAWToARGBRow = RAWToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RAWTOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RAWToARGBRow = RAWToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- RAWToARGBRow = RAWToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- RAWToARGBRow(src_raw, dst_argb, width);
- src_raw += src_stride_raw;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert RGB565 to ARGB.
-LIBYUV_API
-int RGB565ToARGB(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb,
- int width) = RGB565ToARGBRow_C;
- if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
- src_stride_rgb565 = -src_stride_rgb565;
- }
- // Coalesce rows.
- if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_rgb565 = dst_stride_argb = 0;
- }
-#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_RGB565TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_RGB565TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToARGBRow = RGB565ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_RGB565TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToARGBRow = RGB565ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RGB565TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- RGB565ToARGBRow = RGB565ToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- RGB565ToARGBRow(src_rgb565, dst_argb, width);
- src_rgb565 += src_stride_rgb565;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert ARGB1555 to ARGB.
-LIBYUV_API
-int ARGB1555ToARGB(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb,
- int width) = ARGB1555ToARGBRow_C;
- if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
- src_stride_argb1555 = -src_stride_argb1555;
- }
- // Coalesce rows.
- if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb1555 = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
- src_argb1555 += src_stride_argb1555;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert ARGB4444 to ARGB.
-LIBYUV_API
-int ARGB4444ToARGB(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb,
- int width) = ARGB4444ToARGBRow_C;
- if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
- src_stride_argb4444 = -src_stride_argb4444;
- }
- // Coalesce rows.
- if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_argb4444 = dst_stride_argb = 0;
- }
-#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
- src_argb4444 += src_stride_argb4444;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert AR30 to ARGB.
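-// AR30 packs 2 bit alpha and three 10 bit channels into each 32 bit
-// little endian word; only C row functions are used for these conversions.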
-LIBYUV_API
-int AR30ToARGB(const uint8_t* src_ar30,
- int src_stride_ar30,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- if (!src_ar30 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
- src_stride_ar30 = -src_stride_ar30;
- }
- // Coalesce rows.
- if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_ar30 = dst_stride_argb = 0;
- }
- for (y = 0; y < height; ++y) {
- AR30ToARGBRow_C(src_ar30, dst_argb, width);
- src_ar30 += src_stride_ar30;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert AR30 to ABGR.
-LIBYUV_API
-int AR30ToABGR(const uint8_t* src_ar30,
- int src_stride_ar30,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- int y;
- if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
- src_stride_ar30 = -src_stride_ar30;
- }
- // Coalesce rows.
- if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) {
- width *= height;
- height = 1;
- src_stride_ar30 = dst_stride_abgr = 0;
- }
- for (y = 0; y < height; ++y) {
- AR30ToABGRRow_C(src_ar30, dst_abgr, width);
- src_ar30 += src_stride_ar30;
- dst_abgr += dst_stride_abgr;
- }
- return 0;
-}
-
-// Convert AR30 to AB30.
-LIBYUV_API
-int AR30ToAB30(const uint8_t* src_ar30,
- int src_stride_ar30,
- uint8_t* dst_ab30,
- int dst_stride_ab30,
- int width,
- int height) {
- int y;
- if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
- src_stride_ar30 = -src_stride_ar30;
- }
- // Coalesce rows.
- if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) {
- width *= height;
- height = 1;
- src_stride_ar30 = dst_stride_ab30 = 0;
- }
- for (y = 0; y < height; ++y) {
- AR30ToAB30Row_C(src_ar30, dst_ab30, width);
- src_ar30 += src_stride_ar30;
- dst_ab30 += dst_stride_ab30;
- }
- return 0;
-}
-
-// Convert NV12 to ARGB with matrix
-static int NV12ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*NV12ToARGBRow)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
- if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
-
-// Convert NV21 to ARGB with matrix
-static int NV21ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*NV21ToARGBRow)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C;
- if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_NV21TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV21TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV21ToARGBRow = NV21ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV21TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_NV21TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- if (y & 1) {
- src_vu += src_stride_vu;
- }
- }
- return 0;
-}
-
-// Convert NV12 to ARGB.
-LIBYUV_API
-int NV12ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb,
- dst_stride_argb, &kYuvI601Constants, width, height);
-}
-
-// Convert NV21 to ARGB.
-LIBYUV_API
-int NV21ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb,
- dst_stride_argb, &kYuvI601Constants, width, height);
-}
-
-// Convert NV12 to ABGR.
-// To output ABGR instead of ARGB, swap the UV and use a mirrored yuv matrix.
-// To swap the UV, use NV21 instead of NV12.
-LIBYUV_API
-int NV12ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr,
- dst_stride_abgr, &kYvuI601Constants, width, height);
-}
-
-// Convert NV21 to ABGR.
-LIBYUV_API
-int NV21ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr,
- dst_stride_abgr, &kYvuI601Constants, width, height);
-}
-
-// TODO(fbarchard): Consider SSSE3 2 step conversion.
-// Convert NV12 to RGB24 with matrix
-static int NV12ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*NV12ToRGB24Row)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C;
- if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
- dst_stride_rgb24 = -dst_stride_rgb24;
- }
-#if defined(HAS_NV12TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB24Row = NV12ToRGB24Row_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- NV12ToRGB24Row = NV12ToRGB24Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB24ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- NV12ToRGB24Row = NV12ToRGB24Row_AVX2;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
- dst_rgb24 += dst_stride_rgb24;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
-
-// Convert NV21 to RGB24 with matrix
-static int NV21ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*NV21ToRGB24Row)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C;
- if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
- dst_stride_rgb24 = -dst_stride_rgb24;
- }
-#if defined(HAS_NV21TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV21ToRGB24Row = NV21ToRGB24Row_NEON;
- }
- }
-#endif
-#if defined(HAS_NV21TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- NV21ToRGB24Row = NV21ToRGB24Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV21TORGB24ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- NV21ToRGB24Row = NV21ToRGB24Row_AVX2;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
- dst_rgb24 += dst_stride_rgb24;
- src_y += src_stride_y;
- if (y & 1) {
- src_vu += src_stride_vu;
- }
- }
- return 0;
-}
-
-// Convert NV12 to RGB24.
-LIBYUV_API
-int NV12ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
- dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
- width, height);
-}
-
-// Convert NV21 to RGB24.
-LIBYUV_API
-int NV21ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
- dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
- width, height);
-}
-
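-// RAW is RGB24 with the R and B channels swapped, so NV12ToRAW and
-// NV21ToRAW reuse the opposite NV to RGB24 path with the mirrored Yvu
-// matrix to flip the channel order.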
-// Convert NV12 to RAW.
-LIBYUV_API
-int NV12ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
- dst_stride_raw, &kYvuI601Constants, width, height);
-}
-
-// Convert NV21 to RAW.
-LIBYUV_API
-int NV21ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
- dst_stride_raw, &kYvuI601Constants, width, height);
-}
-
-// Convert NV21 to YUV24.
-LIBYUV_API
-int NV21ToYUV24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_yuv24,
- int dst_stride_yuv24,
- int width,
- int height) {
- int y;
- void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu,
- uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C;
- if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24;
- dst_stride_yuv24 = -dst_stride_yuv24;
- }
-#if defined(HAS_NV21TOYUV24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- NV21ToYUV24Row = NV21ToYUV24Row_NEON;
- }
- }
-#endif
-#if defined(HAS_NV21TOYUV24ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- NV21ToYUV24Row = NV21ToYUV24Row_AVX2;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width);
- dst_yuv24 += dst_stride_yuv24;
- src_y += src_stride_y;
- if (y & 1) {
- src_vu += src_stride_vu;
- }
- }
- return 0;
-}
-
-// Convert M420 to ARGB.
-LIBYUV_API
-int M420ToARGB(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*NV12ToARGBRow)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
- if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_MSA;
- }
- }
-#endif
-
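- // M420 stores two rows of Y followed by one row of interleaved UV, so
- // each iteration consumes three source rows and produces two ARGB rows.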
- for (y = 0; y < height - 1; y += 2) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
- NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
- dst_argb + dst_stride_argb, &kYuvI601Constants, width);
- dst_argb += dst_stride_argb * 2;
- src_m420 += src_stride_m420 * 3;
- }
- if (height & 1) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
- }
- return 0;
-}
-
-// Convert YUY2 to ARGB.
-LIBYUV_API
-int YUY2ToARGB(const uint8_t* src_yuy2,
- int src_stride_yuy2,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants, int width) =
- YUY2ToARGBRow_C;
- if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
- src_stride_yuy2 = -src_stride_yuy2;
- }
- // Coalesce rows.
- if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_yuy2 = dst_stride_argb = 0;
- }
-#if defined(HAS_YUY2TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_YUY2TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_YUY2TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToARGBRow = YUY2ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_YUY2TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToARGBRow = YUY2ToARGBRow_MSA;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
- src_yuy2 += src_stride_yuy2;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-// Convert UYVY to ARGB.
-LIBYUV_API
-int UYVYToARGB(const uint8_t* src_uyvy,
- int src_stride_uyvy,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants, int width) =
- UYVYToARGBRow_C;
- if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
- src_stride_uyvy = -src_stride_uyvy;
- }
- // Coalesce rows.
- if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_uyvy = dst_stride_argb = 0;
- }
-#if defined(HAS_UYVYTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- UYVYToARGBRow = UYVYToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_UYVYTOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- UYVYToARGBRow = UYVYToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_UYVYTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- UYVYToARGBRow = UYVYToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_UYVYTOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- UYVYToARGBRow = UYVYToARGBRow_MSA;
- }
- }
-#endif
- for (y = 0; y < height; ++y) {
- UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
- src_uyvy += src_stride_uyvy;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-static void WeavePixels(const uint8_t* src_u,
- const uint8_t* src_v,
- int src_pixel_stride_uv,
- uint8_t* dst_uv,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- dst_uv[0] = *src_u;
- dst_uv[1] = *src_v;
- dst_uv += 2;
- src_u += src_pixel_stride_uv;
- src_v += src_pixel_stride_uv;
- }
-}
-
-// Convert Android420 to ARGB with matrix.
-LIBYUV_API
-int Android420ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- uint8_t* dst_uv;
- const ptrdiff_t vu_off = src_v - src_u;
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-
- // I420
- if (src_pixel_stride_uv == 1) {
- return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_argb, dst_stride_argb,
- yuvconstants, width, height);
- }
- // NV21
- if (src_pixel_stride_uv == 2 && vu_off == -1 &&
- src_stride_u == src_stride_v) {
- return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb,
- dst_stride_argb, yuvconstants, width, height);
- }
- // NV12
- if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
- return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb,
- dst_stride_argb, yuvconstants, width, height);
- }
-
- // General case fallback: weave U and V into a temporary NV12-style UV plane.
- align_buffer_64(plane_uv, halfwidth * 2 * halfheight);
- dst_uv = plane_uv;
- for (y = 0; y < halfheight; ++y) {
- WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth);
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_uv += halfwidth * 2;
- }
- NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb,
- dst_stride_argb, yuvconstants, width, height);
- free_aligned_buffer_64(plane_uv);
- return 0;
-}
-
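Note: Android420ToARGBMatrix dispatches on the chroma pixel stride: 1 selects planar I420; 2 with V one byte before U (vu_off == -1) selects NV21; 2 with V one byte after U (vu_off == 1) selects NV12; any other layout is woven into a temporary NV12-style UV plane by WeavePixels. A hedged usage sketch for an Android YUV_420_888-style image (the Plane struct and field names are illustrative, not an Android or libyuv API):

    struct Plane { const uint8_t* data; int row_stride; int pixel_stride; };
    int ConvertImage(const Plane p[3],  // p[0]=Y, p[1]=U, p[2]=V
                     uint8_t* argb, int argb_stride, int width, int height) {
      return Android420ToARGB(p[0].data, p[0].row_stride, p[1].data,
                              p[1].row_stride, p[2].data, p[2].row_stride,
                              p[1].pixel_stride,  // typically 1 or 2
                              argb, argb_stride, width, height);
    }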
-// Convert Android420 to ARGB.
-LIBYUV_API
-int Android420ToARGB(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, src_pixel_stride_uv, dst_argb,
- dst_stride_argb, &kYuvI601Constants, width,
- height);
-}
-
-// Convert Android420 to ABGR.
-LIBYUV_API
-int Android420ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- int src_pixel_stride_uv,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height) {
- return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
- src_stride_u, src_pixel_stride_uv, dst_abgr,
- dst_stride_abgr, &kYvuI601Constants, width,
- height);
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc
deleted file mode 100644
index 60140cb4..00000000
--- a/files/source/convert_from.cc
+++ /dev/null
@@ -1,1505 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/convert_from.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/convert.h" // For I420Copy
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "libyuv/row.h"
-#include "libyuv/scale.h" // For ScalePlane()
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
-static __inline int Abs(int v) {
- return v >= 0 ? v : -v;
-}
-
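Note: SUBSAMPLE rounds the magnitude up while preserving sign, so both odd and negative (mirrored) dimensions halve correctly: SUBSAMPLE(5, 1, 1) is (5 + 1) >> 1 = 3, and SUBSAMPLE(-5, 1, 1) is -((5 + 1) >> 1) = -3. The same computation as a plain function, for reference:

    static int Subsample(int v, int a, int s) {
      return v < 0 ? -((-v + a) >> s) : ((v + a) >> s);
    }
    // Subsample(4, 1, 1) == 2, Subsample(5, 1, 1) == 3,
    // Subsample(-4, 1, 1) == -2, Subsample(-5, 1, 1) == -3.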
-// I420 to any I4xx YUV format with mirroring.
-static int I420ToI4xx(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int src_y_width,
- int src_y_height,
- int dst_uv_width,
- int dst_uv_height) {
- const int dst_y_width = Abs(src_y_width);
- const int dst_y_height = Abs(src_y_height);
- const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
- const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
- if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 ||
- dst_uv_height <= 0) {
- return -1;
- }
- if (dst_y) {
- ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
- dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
- }
- ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
- dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
- ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
- dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
- return 0;
-}
-
-// Convert 8 bit YUV to 10 bit.
-LIBYUV_API
-int I420ToI010(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint16_t* dst_y,
- int dst_stride_y,
- uint16_t* dst_u,
- int dst_stride_u,
- uint16_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- // Convert Y plane.
- Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,
- height);
- // Convert UV planes.
- Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
- halfheight);
- Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
- halfheight);
- return 0;
-}
-
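Note: the scale of 1024 asks Convert8To16Plane for a full-range 8-bit to 10-bit expansion, i.e. 0 maps to 0 and 255 maps to 1023 rather than the 1020 a bare left shift by 2 would give. One scalar way to realize that mapping, shown as a sketch rather than the library's actual row code, is to replicate the top bits into the low bits:

    static uint16_t Expand8To10(uint8_t v) {
      // (v << 2) fills the top 10 bits; (v >> 6) repeats the top two
      // bits at the bottom, so 0 -> 0 and 255 -> 1023.
      return (uint16_t)((v << 2) | (v >> 6));
    }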
-// 420 chroma is 1/2 width, 1/2 height
-// 422 chroma is 1/2 width, 1x height
-LIBYUV_API
-int I420ToI422(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- const int dst_uv_width = (Abs(width) + 1) >> 1;
- const int dst_uv_height = Abs(height);
- return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, width, height, dst_uv_width,
- dst_uv_height);
-}
-
-// 420 chroma is 1/2 width, 1/2 height
-// 444 chroma is 1x width, 1x height
-LIBYUV_API
-int I420ToI444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
- const int dst_uv_width = Abs(width);
- const int dst_uv_height = Abs(height);
- return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, width, height, dst_uv_width,
- dst_uv_height);
-}
-
-// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
-LIBYUV_API
-int I400Copy(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- int width,
- int height) {
- if (!src_y || !dst_y || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
- }
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- return 0;
-}
-
-LIBYUV_API
-int I422ToYUY2(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_yuy2,
- int dst_stride_yuy2,
- int width,
- int height) {
- int y;
- void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
- const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
- I422ToYUY2Row_C;
- if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
- dst_stride_yuy2 = -dst_stride_yuy2;
- }
- // Coalesce rows.
- if (src_stride_y == width && src_stride_u * 2 == width &&
- src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
- }
-#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_SSE2;
- }
- }
-#endif
-#if defined(HAS_I422TOYUY2ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- I422ToYUY2Row = I422ToYUY2Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_yuy2 += dst_stride_yuy2;
- }
- return 0;
-}
-
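Note: YUY2 packs two luma samples around one shared chroma pair per 4-byte macropixel, in the byte order Y0 U Y1 V, which is what I422ToYUY2Row produces from the three I422 planes. A scalar reference for an even width (a sketch of the row semantics, not the SIMD code):

    static void PackYUY2Row(const uint8_t* y, const uint8_t* u,
                            const uint8_t* v, uint8_t* yuy2, int width) {
      for (int x = 0; x < width; x += 2) {
        yuy2[0] = y[x];      // Y0
        yuy2[1] = u[x / 2];  // shared U
        yuy2[2] = y[x + 1];  // Y1
        yuy2[3] = v[x / 2];  // shared V
        yuy2 += 4;
      }
    }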
-LIBYUV_API
-int I420ToYUY2(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_yuy2,
- int dst_stride_yuy2,
- int width,
- int height) {
- int y;
- void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
- const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
- I422ToYUY2Row_C;
- if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
- dst_stride_yuy2 = -dst_stride_yuy2;
- }
-#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_SSE2;
- }
- }
-#endif
-#if defined(HAS_I422TOYUY2ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- I422ToYUY2Row = I422ToYUY2Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToYUY2Row = I422ToYUY2Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOYUY2ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- I422ToYUY2Row = I422ToYUY2Row_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TOYUY2ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToYUY2Row = I422ToYUY2Row_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
- I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
- dst_yuy2 + dst_stride_yuy2, width);
- src_y += src_stride_y * 2;
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_yuy2 += dst_stride_yuy2 * 2;
- }
- if (height & 1) {
- I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
- }
- return 0;
-}
-
-LIBYUV_API
-int I422ToUYVY(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_uyvy,
- int dst_stride_uyvy,
- int width,
- int height) {
- int y;
- void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
- const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
- I422ToUYVYRow_C;
- if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
- dst_stride_uyvy = -dst_stride_uyvy;
- }
- // Coalesce rows.
- if (src_stride_y == width && src_stride_u * 2 == width &&
- src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
- }
-#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- I422ToUYVYRow = I422ToUYVYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- I422ToUYVYRow = I422ToUYVYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToUYVYRow = I422ToUYVYRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_uyvy += dst_stride_uyvy;
- }
- return 0;
-}
-
-LIBYUV_API
-int I420ToUYVY(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_uyvy,
- int dst_stride_uyvy,
- int width,
- int height) {
- int y;
- void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
- const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
- I422ToUYVYRow_C;
- if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
- dst_stride_uyvy = -dst_stride_uyvy;
- }
-#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- I422ToUYVYRow = I422ToUYVYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- I422ToUYVYRow = I422ToUYVYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
- if (IS_ALIGNED(width, 32)) {
- I422ToUYVYRow = I422ToUYVYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_I422TOUYVYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToUYVYRow = I422ToUYVYRow_MMI;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
- I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
- dst_uyvy + dst_stride_uyvy, width);
- src_y += src_stride_y * 2;
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_uyvy += dst_stride_uyvy * 2;
- }
- if (height & 1) {
- I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
- }
- return 0;
-}
-
-// TODO(fbarchard): test negative height for invert.
-LIBYUV_API
-int I420ToNV12(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_uv,
- int dst_stride_uv,
- int width,
- int height) {
- if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
- height == 0) {
- return -1;
- }
- int halfwidth = (width + 1) / 2;
- int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
- MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
- halfwidth, halfheight);
- return 0;
-}
-
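Note: NV12 keeps the full-resolution Y plane and interleaves U and V at half resolution, so I420ToNV12 reduces to a plane copy plus MergeUVPlane. The merge in scalar form (a reference sketch of one row, not the SIMD path):

    static void MergeUVRowRef(const uint8_t* u, const uint8_t* v,
                              uint8_t* uv, int width) {
      for (int x = 0; x < width; ++x) {
        uv[2 * x + 0] = u[x];  // NV12 order: U first,
        uv[2 * x + 1] = v[x];  // then V (I420ToNV21 swaps the planes)
      }
    }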
-LIBYUV_API
-int I420ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
- src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
- width, height);
-}
-
-// Convert I420 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGBARow = I422ToRGBARow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
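Note: the "if (y & 1)" step is how an I422 row kernel serves an I420 source: 4:2:0 chroma covers two luma rows, so each U/V row is fed to the row function twice and the chroma pointers advance only after odd rows. Schematically (RowFunc is a placeholder name):

    for (int y = 0; y < height; ++y) {
      RowFunc(src_y, src_u, src_v, dst, width);  // chroma row reused twice
      src_y += src_stride_y;
      dst += dst_stride;
      if (y & 1) {  // move to the next chroma row every second luma row
        src_u += src_stride_u;
        src_v += src_stride_v;
      }
    }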
-// Convert I420 to RGBA.
-LIBYUV_API
-int I420ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgba, dst_stride_rgba,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to BGRA.
-LIBYUV_API
-int I420ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB24Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
- dst_stride_rgb24 = -dst_stride_rgb24;
- }
-#if defined(HAS_I422TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- I422ToRGB24Row = I422ToRGB24Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB24Row = I422ToRGB24Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
- dst_rgb24 += dst_stride_rgb24;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB24.
-LIBYUV_API
-int I420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to RAW.
-LIBYUV_API
-int I420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H420 to RGB24.
-LIBYUV_API
-int H420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvH709Constants, width, height);
-}
-
-// Convert H420 to RAW.
-LIBYUV_API
-int H420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I420 to ARGB1555.
-LIBYUV_API
-int I420ToARGB1555(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb1555,
- int dst_stride_argb1555,
- int width,
- int height) {
- int y;
- void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGB1555Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
- dst_stride_argb1555 = -dst_stride_argb1555;
- }
-#if defined(HAS_I422TOARGB1555ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
- width);
- dst_argb1555 += dst_stride_argb1555;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to ARGB4444.
-LIBYUV_API
-int I420ToARGB4444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb4444,
- int dst_stride_argb4444,
- int width,
- int height) {
- int y;
- void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGB4444Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
- dst_stride_argb4444 = -dst_stride_argb4444;
- }
-#if defined(HAS_I422TOARGB4444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
- width);
- dst_argb4444 += dst_stride_argb4444;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB565 with specified color matrix.
-LIBYUV_API
-int I420ToRGB565Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB565Row = I422ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB565.
-LIBYUV_API
-int I420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb565, dst_stride_rgb565,
- &kYuvI601Constants, width, height);
-}
-
-// Convert J420 to RGB565.
-LIBYUV_API
-int J420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb565, dst_stride_rgb565,
- &kYuvJPEGConstants, width, height);
-}
-
-// Convert H420 to RGB565.
-LIBYUV_API
-int H420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb565, dst_stride_rgb565,
- &kYuvH709Constants, width, height);
-}
-
-// Convert I422 to RGB565.
-LIBYUV_API
-int I422ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB565Row = I422ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
-static const uint8_t kDither565_4x4[16] = {
- 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
-};
-
-// Convert I420 to RGB565 with dithering.
-LIBYUV_API
-int I420ToRGB565Dither(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const uint8_t* dither4x4,
- int width,
- int height) {
- int y;
- void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToARGBRow_C;
- void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
- const uint32_t dither4, int width) =
- ARGBToRGB565DitherRow_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
- if (!dither4x4) {
- dither4x4 = kDither565_4x4;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
- }
- }
-#endif
- {
- // Allocate a row of argb.
- align_buffer_64(row_argb, width * 4);
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
- ARGBToRGB565DitherRow(row_argb, dst_rgb565,
- *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
- width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- free_aligned_buffer_64(row_argb);
- }
- return 0;
-}
-
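Note: I420ToRGB565Dither converts each row to ARGB in a temporary buffer, then truncates to 565 while adding a small ordered-dither bias per pixel. The expression *(const uint32_t*)(dither4x4 + ((y & 3) << 2)) packs one 4-byte row of the 4x4 matrix into a uint32 that the row function cycles across x. A scalar reference for the dithered pack, assuming little-endian 565 storage (a sketch, not the library's row implementation):

    static void ToRGB565DitherRef(const uint8_t* argb, uint8_t* rgb565,
                                  uint32_t dither4, int width) {
      for (int x = 0; x < width; ++x) {
        int d = (dither4 >> ((x & 3) * 8)) & 0xff;  // bias 0..7
        int b = argb[4 * x + 0] + d;
        int g = argb[4 * x + 1] + d;
        int r = argb[4 * x + 2] + d;
        if (b > 255) b = 255;
        if (g > 255) g = 255;
        if (r > 255) r = 255;
        uint16_t p = (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
        rgb565[2 * x + 0] = (uint8_t)(p & 0xff);
        rgb565[2 * x + 1] = (uint8_t)(p >> 8);
      }
    }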
-// Convert I420 to AR30 with matrix
-static int I420ToAR30Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToAR30Row_C;
-
- if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
- dst_stride_ar30 = -dst_stride_ar30;
- }
-
-#if defined(HAS_I422TOAR30ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToAR30Row = I422ToAR30Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOAR30ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToAR30Row = I422ToAR30Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToAR30Row = I422ToAR30Row_AVX2;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
- dst_ar30 += dst_stride_ar30;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to AR30.
-LIBYUV_API
-int I420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYuvI601Constants, width, height);
-}
-
-// Convert H420 to AR30.
-LIBYUV_API
-int H420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYuvH709Constants, width, height);
-}
-
-// Convert I420 to specified format
-LIBYUV_API
-int ConvertFromI420(const uint8_t* y,
- int y_stride,
- const uint8_t* u,
- int u_stride,
- const uint8_t* v,
- int v_stride,
- uint8_t* dst_sample,
- int dst_sample_stride,
- int width,
- int height,
- uint32_t fourcc) {
- uint32_t format = CanonicalFourCC(fourcc);
- int r = 0;
- if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
- return -1;
- }
- switch (format) {
- // Single plane formats
- case FOURCC_YUY2:
- r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2, width,
- height);
- break;
- case FOURCC_UYVY:
- r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2, width,
- height);
- break;
- case FOURCC_RGBP:
- r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2, width,
- height);
- break;
- case FOURCC_RGBO:
- r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2,
- width, height);
- break;
- case FOURCC_R444:
- r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 2,
- width, height);
- break;
- case FOURCC_24BG:
- r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 3, width,
- height);
- break;
- case FOURCC_RAW:
- r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 3, width,
- height);
- break;
- case FOURCC_ARGB:
- r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4, width,
- height);
- break;
- case FOURCC_BGRA:
- r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4, width,
- height);
- break;
- case FOURCC_ABGR:
- r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4, width,
- height);
- break;
- case FOURCC_RGBA:
- r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4, width,
- height);
- break;
- case FOURCC_AR30:
- r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width * 4, width,
- height);
- break;
- case FOURCC_I400:
- r = I400Copy(y, y_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width, width,
- height);
- break;
- case FOURCC_NV12: {
- uint8_t* dst_uv = dst_sample + width * height;
- r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width, dst_uv,
- dst_sample_stride ? dst_sample_stride : width, width,
- height);
- break;
- }
- case FOURCC_NV21: {
- uint8_t* dst_vu = dst_sample + width * height;
- r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride ? dst_sample_stride : width, dst_vu,
- dst_sample_stride ? dst_sample_stride : width, width,
- height);
- break;
- }
- // TODO(fbarchard): Add M420.
- // Triplanar formats
- case FOURCC_I420:
- case FOURCC_YV12: {
- dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
- int halfstride = (dst_sample_stride + 1) / 2;
- int halfheight = (height + 1) / 2;
- uint8_t* dst_u;
- uint8_t* dst_v;
- if (format == FOURCC_YV12) {
- dst_v = dst_sample + dst_sample_stride * height;
- dst_u = dst_v + halfstride * halfheight;
- } else {
- dst_u = dst_sample + dst_sample_stride * height;
- dst_v = dst_u + halfstride * halfheight;
- }
- r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
- width, height);
- break;
- }
- case FOURCC_I422:
- case FOURCC_YV16: {
- dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
- int halfstride = (dst_sample_stride + 1) / 2;
- uint8_t* dst_u;
- uint8_t* dst_v;
- if (format == FOURCC_YV16) {
- dst_v = dst_sample + dst_sample_stride * height;
- dst_u = dst_v + halfstride * height;
- } else {
- dst_u = dst_sample + dst_sample_stride * height;
- dst_v = dst_u + halfstride * height;
- }
- r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
- width, height);
- break;
- }
- case FOURCC_I444:
- case FOURCC_YV24: {
- dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
- uint8_t* dst_u;
- uint8_t* dst_v;
- if (format == FOURCC_YV24) {
- dst_v = dst_sample + dst_sample_stride * height;
- dst_u = dst_v + dst_sample_stride * height;
- } else {
- dst_u = dst_sample + dst_sample_stride * height;
- dst_v = dst_u + dst_sample_stride * height;
- }
- r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
- dst_sample_stride, dst_u, dst_sample_stride, dst_v,
- dst_sample_stride, width, height);
- break;
- }
- // Formats not supported: MJPG, biplanar, and some RGB formats.
- default:
- return -1; // unknown fourcc - return failure code.
- }
- return r;
-}
-
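Note: ConvertFromI420 is the generic exit point: it canonicalizes the fourcc, dispatches to the matching I420To* routine, and substitutes a packed default stride whenever dst_sample_stride is 0. A hedged usage sketch (assumes the caller sized the output at width * height * 4 bytes for ARGB):

    int I420FrameToARGB(const uint8_t* y, int y_stride,
                        const uint8_t* u, int u_stride,
                        const uint8_t* v, int v_stride,
                        uint8_t* out, int width, int height) {
      // Stride 0 selects the packed default, width * 4 for ARGB.
      return ConvertFromI420(y, y_stride, u, u_stride, v, v_stride, out,
                             0, width, height, FOURCC_ARGB);
    }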
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/rotate.cc b/files/source/rotate.cc
deleted file mode 100644
index d414186a..00000000
--- a/files/source/rotate.cc
+++ /dev/null
@@ -1,605 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate.h"
-
-#include "libyuv/convert.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-LIBYUV_API
-void TransposePlane(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- int i = height;
-#if defined(HAS_TRANSPOSEWX16_MSA)
- void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
- int dst_stride, int width) = TransposeWx16_C;
-#else
- void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
- int dst_stride, int width) = TransposeWx8_C;
-#endif
-#if defined(HAS_TRANSPOSEWX8_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- TransposeWx8 = TransposeWx8_NEON;
- }
-#endif
-#if defined(HAS_TRANSPOSEWX8_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- TransposeWx8 = TransposeWx8_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- TransposeWx8 = TransposeWx8_SSSE3;
- }
- }
-#endif
-#if defined(HAS_TRANSPOSEWX8_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- TransposeWx8 = TransposeWx8_MMI;
- }
-#endif
-#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- TransposeWx8 = TransposeWx8_Fast_SSSE3;
- }
- }
-#endif
-#if defined(HAS_TRANSPOSEWX16_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- TransposeWx16 = TransposeWx16_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- TransposeWx16 = TransposeWx16_MSA;
- }
- }
-#endif
-
-#if defined(HAS_TRANSPOSEWX16_MSA)
- // Work across the source in 16x16 tiles
- while (i >= 16) {
- TransposeWx16(src, src_stride, dst, dst_stride, width);
- src += 16 * src_stride; // Go down 16 rows.
- dst += 16; // Move over 16 columns.
- i -= 16;
- }
-#else
- // Work across the source in 8x8 tiles
- while (i >= 8) {
- TransposeWx8(src, src_stride, dst, dst_stride, width);
- src += 8 * src_stride; // Go down 8 rows.
- dst += 8; // Move over 8 columns.
- i -= 8;
- }
-#endif
-
- if (i > 0) {
- TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
- }
-}
-
-LIBYUV_API
-void RotatePlane90(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- // Rotate by 90 is a transpose with the source read
- // from bottom to top. So set the source pointer to the end
- // of the buffer and flip the sign of the source stride.
- src += src_stride * (height - 1);
- src_stride = -src_stride;
- TransposePlane(src, src_stride, dst, dst_stride, width, height);
-}
-
-LIBYUV_API
-void RotatePlane270(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- // Rotate by 270 is a transpose with the destination written
- // from bottom to top. So set the destination pointer to the end
- // of the buffer and flip the sign of the destination stride.
- dst += dst_stride * (width - 1);
- dst_stride = -dst_stride;
- TransposePlane(src, src_stride, dst, dst_stride, width, height);
-}
-
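Note: both rotations reuse one tiled transpose kernel: 90 degrees reads the source bottom-up before transposing, 270 writes the destination bottom-up after it. In index terms, a 90-degree clockwise rotation of a width x height plane satisfies dst[r][c] = src[height - 1 - c][r]. A scalar reference for intuition (the tiled code computes the same mapping):

    static void Rotate90Ref(const uint8_t* src, int src_stride,
                            uint8_t* dst, int dst_stride,
                            int width, int height) {
      // dst is height wide and width tall.
      for (int r = 0; r < width; ++r) {
        for (int c = 0; c < height; ++c) {
          dst[r * dst_stride + c] = src[(height - 1 - c) * src_stride + r];
        }
      }
    }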
-LIBYUV_API
-void RotatePlane180(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- // Swap first and last row and mirror the content. Uses a temporary row.
- align_buffer_64(row, width);
- const uint8_t* src_bot = src + src_stride * (height - 1);
- uint8_t* dst_bot = dst + dst_stride * (height - 1);
- int half_height = (height + 1) >> 1;
- int y;
- void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
- void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
-#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- MirrorRow = MirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- MirrorRow = MirrorRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- MirrorRow = MirrorRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MirrorRow = MirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 64)) {
- MirrorRow = MirrorRow_MSA;
- }
- }
-#endif
-#if defined(HAS_MIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MirrorRow = MirrorRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- MirrorRow = MirrorRow_MMI;
- }
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_AVX)
- if (TestCpuFlag(kCpuHasAVX)) {
- CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI;
- }
-#endif
-
- // Odd height will harmlessly mirror the middle row twice.
- for (y = 0; y < half_height; ++y) {
- MirrorRow(src, row, width); // Mirror first row into a buffer
- src += src_stride;
- MirrorRow(src_bot, dst, width); // Mirror last row into first row
- dst += dst_stride;
- CopyRow(row, dst_bot, width); // Copy first mirrored row into last
- src_bot -= src_stride;
- dst_bot -= dst_stride;
- }
- free_aligned_buffer_64(row);
-}
-
-LIBYUV_API
-void TransposeUV(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
- int i = height;
-#if defined(HAS_TRANSPOSEUVWX16_MSA)
- void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
- int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
- int width) = TransposeUVWx16_C;
-#else
- void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a,
- int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
- int width) = TransposeUVWx8_C;
-#endif
-#if defined(HAS_TRANSPOSEUVWX8_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- TransposeUVWx8 = TransposeUVWx8_NEON;
- }
-#endif
-#if defined(HAS_TRANSPOSEUVWX8_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- TransposeUVWx8 = TransposeUVWx8_SSE2;
- }
- }
-#endif
-#if defined(HAS_TRANSPOSEUVWX8_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- TransposeUVWx8 = TransposeUVWx8_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- TransposeUVWx8 = TransposeUVWx8_MMI;
- }
- }
-#endif
-#if defined(HAS_TRANSPOSEUVWX16_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- TransposeUVWx16 = TransposeUVWx16_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- TransposeUVWx16 = TransposeUVWx16_MSA;
- }
- }
-#endif
-
-#if defined(HAS_TRANSPOSEUVWX16_MSA)
- // Work through the source in 16x16 tiles.
- while (i >= 16) {
- TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
- width);
- src += 16 * src_stride; // Go down 16 rows.
- dst_a += 16; // Move over 16 columns.
- dst_b += 16; // Move over 16 columns.
- i -= 16;
- }
-#else
- // Work through the source in 8x8 tiles.
- while (i >= 8) {
- TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
- width);
- src += 8 * src_stride; // Go down 8 rows.
- dst_a += 8; // Move over 8 columns.
- dst_b += 8; // Move over 8 columns.
- i -= 8;
- }
-#endif
-
- if (i > 0) {
- TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
- width, i);
- }
-}
-
-LIBYUV_API
-void RotateUV90(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
- src += src_stride * (height - 1);
- src_stride = -src_stride;
-
- TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
- height);
-}
-
-LIBYUV_API
-void RotateUV270(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
- dst_a += dst_stride_a * (width - 1);
- dst_b += dst_stride_b * (width - 1);
- dst_stride_a = -dst_stride_a;
- dst_stride_b = -dst_stride_b;
-
- TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
- height);
-}
-
-// Rotate 180 is a horizontal and vertical flip.
-LIBYUV_API
-void RotateUV180(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
- int i;
- void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
- int width) = MirrorUVRow_C;
-#if defined(HAS_MIRRORUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- MirrorUVRow = MirrorUVRow_NEON;
- }
-#endif
-#if defined(HAS_MIRRORUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
- MirrorUVRow = MirrorUVRow_SSSE3;
- }
-#endif
-#if defined(HAS_MIRRORUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
- MirrorUVRow = MirrorUVRow_MSA;
- }
-#endif
-#if defined(HAS_MIRRORUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) {
- MirrorUVRow = MirrorUVRow_MMI;
- }
-#endif
-
- dst_a += dst_stride_a * (height - 1);
- dst_b += dst_stride_b * (height - 1);
-
- for (i = 0; i < height; ++i) {
- MirrorUVRow(src, dst_a, dst_b, width);
- src += src_stride;
- dst_a -= dst_stride_a;
- dst_b -= dst_stride_b;
- }
-}
-
-LIBYUV_API
-int RotatePlane(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height,
- enum RotationMode mode) {
- if (!src || width <= 0 || height == 0 || !dst) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src = src + (height - 1) * src_stride;
- src_stride = -src_stride;
- }
-
- switch (mode) {
- case kRotate0:
- // copy frame
- CopyPlane(src, src_stride, dst, dst_stride, width, height);
- return 0;
- case kRotate90:
- RotatePlane90(src, src_stride, dst, dst_stride, width, height);
- return 0;
- case kRotate270:
- RotatePlane270(src, src_stride, dst, dst_stride, width, height);
- return 0;
- case kRotate180:
- RotatePlane180(src, src_stride, dst, dst_stride, width, height);
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-LIBYUV_API
-int I420Rotate(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height,
- enum RotationMode mode) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
- !dst_u || !dst_v) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- switch (mode) {
- case kRotate0:
- // copy frame
- return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, width, height);
- case kRotate90:
- RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
- halfheight);
- RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
- halfheight);
- return 0;
- case kRotate270:
- RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
- halfheight);
- RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
- halfheight);
- return 0;
- case kRotate180:
- RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
- halfheight);
- RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
- halfheight);
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-LIBYUV_API
-int I444Rotate(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height,
- enum libyuv::RotationMode mode) {
- if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
- !dst_u || !dst_v) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- switch (mode) {
- case libyuv::kRotate0:
- // copy frame
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
- CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
- return 0;
- case libyuv::kRotate90:
- RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
- RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
- return 0;
- case libyuv::kRotate270:
- RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
- RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
- return 0;
- case libyuv::kRotate180:
- RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
- RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-LIBYUV_API
-int NV12ToI420Rotate(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height,
- enum RotationMode mode) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
- !dst_v) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_uv = src_uv + (halfheight - 1) * src_stride_uv;
- src_stride_y = -src_stride_y;
- src_stride_uv = -src_stride_uv;
- }
-
- switch (mode) {
- case kRotate0:
- // copy frame
- return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
- dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
- width, height);
- case kRotate90:
- RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
- dst_stride_v, halfwidth, halfheight);
- return 0;
- case kRotate270:
- RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
- dst_stride_v, halfwidth, halfheight);
- return 0;
- case kRotate180:
- RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
- dst_stride_v, halfwidth, halfheight);
- return 0;
- default:
- break;
- }
- return -1;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
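RotatePlane, I420Rotate, I444Rotate and NV12ToI420Rotate above share one calling convention: a negative height requests vertical inversion of the source, kRotate0 degenerates to a copy, and a 90- or 270-degree rotation swaps the output dimensions. A minimal caller sketch, assuming the caller sized the buffers (the names here are illustrative):

    #include <stdint.h>
    #include "libyuv/rotate.h"

    // Hypothetical caller: rotate one plane by 90 degrees. After rotation
    // the destination is height x width, so its stride is the source height.
    int Rotate90Example(const uint8_t* src, int width, int height,
                        uint8_t* dst /* must hold height * width bytes */) {
      return libyuv::RotatePlane(src, width,   // src, src_stride
                                 dst, height,  // dst, dst_stride (swapped)
                                 width, height, libyuv::kRotate90);
    }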
diff --git a/files/source/rotate_common.cc b/files/source/rotate_common.cc
deleted file mode 100644
index ff212ade..00000000
--- a/files/source/rotate_common.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-void TransposeWx8_C(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- dst[0] = src[0 * src_stride];
- dst[1] = src[1 * src_stride];
- dst[2] = src[2 * src_stride];
- dst[3] = src[3 * src_stride];
- dst[4] = src[4 * src_stride];
- dst[5] = src[5 * src_stride];
- dst[6] = src[6 * src_stride];
- dst[7] = src[7 * src_stride];
- ++src;
- dst += dst_stride;
- }
-}
-
-void TransposeUVWx8_C(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- dst_a[0] = src[0 * src_stride + 0];
- dst_b[0] = src[0 * src_stride + 1];
- dst_a[1] = src[1 * src_stride + 0];
- dst_b[1] = src[1 * src_stride + 1];
- dst_a[2] = src[2 * src_stride + 0];
- dst_b[2] = src[2 * src_stride + 1];
- dst_a[3] = src[3 * src_stride + 0];
- dst_b[3] = src[3 * src_stride + 1];
- dst_a[4] = src[4 * src_stride + 0];
- dst_b[4] = src[4 * src_stride + 1];
- dst_a[5] = src[5 * src_stride + 0];
- dst_b[5] = src[5 * src_stride + 1];
- dst_a[6] = src[6 * src_stride + 0];
- dst_b[6] = src[6 * src_stride + 1];
- dst_a[7] = src[7 * src_stride + 0];
- dst_b[7] = src[7 * src_stride + 1];
- src += 2;
- dst_a += dst_stride_a;
- dst_b += dst_stride_b;
- }
-}
-
-void TransposeWxH_C(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- int i;
- for (i = 0; i < width; ++i) {
- int j;
- for (j = 0; j < height; ++j) {
- dst[i * dst_stride + j] = src[j * src_stride + i];
- }
- }
-}
-
-void TransposeUVWxH_C(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
- int i;
- for (i = 0; i < width * 2; i += 2) {
- int j;
- for (j = 0; j < height; ++j) {
- dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
- dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
- }
- }
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
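TransposeWxH_C above states the transpose contract in its simplest form: dst[i * dst_stride + j] = src[j * src_stride + i]. A small self-checking harness against that reference (sizes arbitrary; assumes the declaration from libyuv/rotate_row.h):

    #include <assert.h>
    #include <stdint.h>
    #include "libyuv/rotate_row.h"

    // Hypothetical harness: fill a 3x5 plane, transpose with the reference
    // C kernel, and verify dst[i][j] == src[j][i].
    void CheckTransposeWxH(void) {
      enum { kW = 5, kH = 3 };
      uint8_t src[kH][kW], dst[kW][kH];
      for (int j = 0; j < kH; ++j)
        for (int i = 0; i < kW; ++i)
          src[j][i] = (uint8_t)(j * kW + i);
      TransposeWxH_C(&src[0][0], kW, &dst[0][0], kH, kW, kH);
      for (int i = 0; i < kW; ++i)
        for (int j = 0; j < kH; ++j)
          assert(dst[i][j] == src[j][i]);
    }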
diff --git a/files/source/rotate_dspr2.cc b/files/source/rotate_dspr2.cc
deleted file mode 100644
index 5d2338de..00000000
--- a/files/source/rotate_dspr2.cc
+++ /dev/null
@@ -1,475 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void TransposeWx8_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
- int width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
- "addu $t3, $t2, %[src_stride] \n"
- "addu $t5, $t4, %[src_stride] \n"
- "addu $t6, $t2, $t4 \n"
- "andi $t0, %[dst], 0x3 \n"
- "andi $t1, %[dst_stride], 0x3 \n"
- "or $t0, $t0, $t1 \n"
- "bnez $t0, 11f \n"
- " subu $t7, $t9, %[src_stride] \n"
- // dst + dst_stride word aligned
- "1: \n"
- "lbu $t0, 0(%[src]) \n"
- "lbux $t1, %[src_stride](%[src]) \n"
- "lbux $t8, $t2(%[src]) \n"
- "lbux $t9, $t3(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s0, $t8, $t0 \n"
- "lbux $t0, $t4(%[src]) \n"
- "lbux $t1, $t5(%[src]) \n"
- "lbux $t8, $t6(%[src]) \n"
- "lbux $t9, $t7(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s1, $t8, $t0 \n"
- "sw $s0, 0(%[dst]) \n"
- "addiu %[width], -1 \n"
- "addiu %[src], 1 \n"
- "sw $s1, 4(%[dst]) \n"
- "bnez %[width], 1b \n"
- " addu %[dst], %[dst], %[dst_stride] \n"
- "b 2f \n"
- // dst + dst_stride unaligned
- "11: \n"
- "lbu $t0, 0(%[src]) \n"
- "lbux $t1, %[src_stride](%[src]) \n"
- "lbux $t8, $t2(%[src]) \n"
- "lbux $t9, $t3(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s0, $t8, $t0 \n"
- "lbux $t0, $t4(%[src]) \n"
- "lbux $t1, $t5(%[src]) \n"
- "lbux $t8, $t6(%[src]) \n"
- "lbux $t9, $t7(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s1, $t8, $t0 \n"
- "swr $s0, 0(%[dst]) \n"
- "swl $s0, 3(%[dst]) \n"
- "addiu %[width], -1 \n"
- "addiu %[src], 1 \n"
- "swr $s1, 4(%[dst]) \n"
- "swl $s1, 7(%[dst]) \n"
- "bnez %[width], 11b \n"
- "addu %[dst], %[dst], %[dst_stride] \n"
- "2: \n"
- ".set pop \n"
- : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
- : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1");
-}
-
-void TransposeWx8_Fast_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
- int width) {
- __asm__ __volatile__(
- ".set noat \n"
- ".set push \n"
- ".set noreorder \n"
- "beqz %[width], 2f \n"
- " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
- "addu $t3, $t2, %[src_stride] \n"
- "addu $t5, $t4, %[src_stride] \n"
- "addu $t6, $t2, $t4 \n"
-
- "srl $AT, %[width], 0x2 \n"
- "andi $t0, %[dst], 0x3 \n"
- "andi $t1, %[dst_stride], 0x3 \n"
- "or $t0, $t0, $t1 \n"
- "bnez $t0, 11f \n"
- " subu $t7, $t9, %[src_stride] \n"
- // dst + dst_stride word aligned
- "1: \n"
- "lw $t0, 0(%[src]) \n"
- "lwx $t1, %[src_stride](%[src]) \n"
- "lwx $t8, $t2(%[src]) \n"
- "lwx $t9, $t3(%[src]) \n"
-
- // t0 = | 30 | 20 | 10 | 00 |
- // t1 = | 31 | 21 | 11 | 01 |
- // t8 = | 32 | 22 | 12 | 02 |
- // t9 = | 33 | 23 | 13 | 03 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 21 | 01 | 20 | 00 |
- // s1 = | 23 | 03 | 22 | 02 |
- // s2 = | 31 | 11 | 30 | 10 |
- // s3 = | 33 | 13 | 32 | 12 |
-
- "precr.qb.ph $s4, $s1, $s0 \n"
- "precrq.qb.ph $s5, $s1, $s0 \n"
- "precr.qb.ph $s6, $s3, $s2 \n"
- "precrq.qb.ph $s7, $s3, $s2 \n"
-
- // s4 = | 03 | 02 | 01 | 00 |
- // s5 = | 23 | 22 | 21 | 20 |
- // s6 = | 13 | 12 | 11 | 10 |
- // s7 = | 33 | 32 | 31 | 30 |
-
- "lwx $t0, $t4(%[src]) \n"
- "lwx $t1, $t5(%[src]) \n"
- "lwx $t8, $t6(%[src]) \n"
- "lwx $t9, $t7(%[src]) \n"
-
- // t0 = | 34 | 24 | 14 | 04 |
- // t1 = | 35 | 25 | 15 | 05 |
- // t8 = | 36 | 26 | 16 | 06 |
- // t9 = | 37 | 27 | 17 | 07 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 25 | 05 | 24 | 04 |
- // s1 = | 27 | 07 | 26 | 06 |
- // s2 = | 35 | 15 | 34 | 14 |
- // s3 = | 37 | 17 | 36 | 16 |
-
- "precr.qb.ph $t0, $s1, $s0 \n"
- "precrq.qb.ph $t1, $s1, $s0 \n"
- "precr.qb.ph $t8, $s3, $s2 \n"
- "precrq.qb.ph $t9, $s3, $s2 \n"
-
- // t0 = | 07 | 06 | 05 | 04 |
- // t1 = | 27 | 26 | 25 | 24 |
- // t8 = | 17 | 16 | 15 | 14 |
- // t9 = | 37 | 36 | 35 | 34 |
-
- "addu $s0, %[dst], %[dst_stride] \n"
- "addu $s1, $s0, %[dst_stride] \n"
- "addu $s2, $s1, %[dst_stride] \n"
-
- "sw $s4, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $s6, 0($s0) \n"
- "sw $t8, 4($s0) \n"
- "sw $s5, 0($s1) \n"
- "sw $t1, 4($s1) \n"
- "sw $s7, 0($s2) \n"
- "sw $t9, 4($s2) \n"
-
- "addiu $AT, -1 \n"
- "addiu %[src], 4 \n"
-
- "bnez $AT, 1b \n"
- " addu %[dst], $s2, %[dst_stride] \n"
- "b 2f \n"
- // dst + dst_stride unaligned
- "11: \n"
- "lw $t0, 0(%[src]) \n"
- "lwx $t1, %[src_stride](%[src]) \n"
- "lwx $t8, $t2(%[src]) \n"
- "lwx $t9, $t3(%[src]) \n"
-
- // t0 = | 30 | 20 | 10 | 00 |
- // t1 = | 31 | 21 | 11 | 01 |
- // t8 = | 32 | 22 | 12 | 02 |
- // t9 = | 33 | 23 | 13 | 03 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 21 | 01 | 20 | 00 |
- // s1 = | 23 | 03 | 22 | 02 |
- // s2 = | 31 | 11 | 30 | 10 |
- // s3 = | 33 | 13 | 32 | 12 |
-
- "precr.qb.ph $s4, $s1, $s0 \n"
- "precrq.qb.ph $s5, $s1, $s0 \n"
- "precr.qb.ph $s6, $s3, $s2 \n"
- "precrq.qb.ph $s7, $s3, $s2 \n"
-
- // s4 = | 03 | 02 | 01 | 00 |
- // s5 = | 23 | 22 | 21 | 20 |
- // s6 = | 13 | 12 | 11 | 10 |
- // s7 = | 33 | 32 | 31 | 30 |
-
- "lwx $t0, $t4(%[src]) \n"
- "lwx $t1, $t5(%[src]) \n"
- "lwx $t8, $t6(%[src]) \n"
- "lwx $t9, $t7(%[src]) \n"
-
- // t0 = | 34 | 24 | 14 | 04 |
- // t1 = | 35 | 25 | 15 | 05 |
- // t8 = | 36 | 26 | 16 | 06 |
- // t9 = | 37 | 27 | 17 | 07 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 25 | 05 | 24 | 04 |
- // s1 = | 27 | 07 | 26 | 06 |
- // s2 = | 35 | 15 | 34 | 14 |
- // s3 = | 37 | 17 | 36 | 16 |
-
- "precr.qb.ph $t0, $s1, $s0 \n"
- "precrq.qb.ph $t1, $s1, $s0 \n"
- "precr.qb.ph $t8, $s3, $s2 \n"
- "precrq.qb.ph $t9, $s3, $s2 \n"
-
- // t0 = | 07 | 06 | 05 | 04 |
- // t1 = | 27 | 26 | 25 | 24 |
- // t8 = | 17 | 16 | 15 | 14 |
- // t9 = | 37 | 36 | 35 | 34 |
-
- "addu $s0, %[dst], %[dst_stride] \n"
- "addu $s1, $s0, %[dst_stride] \n"
- "addu $s2, $s1, %[dst_stride] \n"
-
- "swr $s4, 0(%[dst]) \n"
- "swl $s4, 3(%[dst]) \n"
- "swr $t0, 4(%[dst]) \n"
- "swl $t0, 7(%[dst]) \n"
- "swr $s6, 0($s0) \n"
- "swl $s6, 3($s0) \n"
- "swr $t8, 4($s0) \n"
- "swl $t8, 7($s0) \n"
- "swr $s5, 0($s1) \n"
- "swl $s5, 3($s1) \n"
- "swr $t1, 4($s1) \n"
- "swl $t1, 7($s1) \n"
- "swr $s7, 0($s2) \n"
- "swl $s7, 3($s2) \n"
- "swr $t9, 4($s2) \n"
- "swl $t9, 7($s2) \n"
-
- "addiu $AT, -1 \n"
- "addiu %[src], 4 \n"
-
- "bnez $AT, 11b \n"
- " addu %[dst], $s2, %[dst_stride] \n"
- "2: \n"
- ".set pop \n"
- ".set at \n"
- : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
- : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
- "s2", "s3", "s4", "s5", "s6", "s7");
-}
-
-void TransposeUVWx8_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst_a,
- int dst_stride_a,
- uint8* dst_b,
- int dst_stride_b,
- int width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "beqz %[width], 2f \n"
- " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
- "addu $t3, $t2, %[src_stride] \n"
- "addu $t5, $t4, %[src_stride] \n"
- "addu $t6, $t2, $t4 \n"
- "subu $t7, $t9, %[src_stride] \n"
- "srl $t1, %[width], 1 \n"
-
-      // check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
- "andi $t0, %[dst_a], 0x3 \n"
- "andi $t8, %[dst_b], 0x3 \n"
- "or $t0, $t0, $t8 \n"
- "andi $t8, %[dst_stride_a], 0x3 \n"
- "andi $s5, %[dst_stride_b], 0x3 \n"
- "or $t8, $t8, $s5 \n"
- "or $t0, $t0, $t8 \n"
- "bnez $t0, 11f \n"
- " nop \n"
- // dst + dst_stride word aligned (both, a & b dst addresses)
- "1: \n"
- "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
- "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
- "addu $s5, %[dst_a], %[dst_stride_a] \n"
- "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
- "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
- "addu $s6, %[dst_b], %[dst_stride_b] \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
-
- "sw $s3, 0($s5) \n"
- "sw $s4, 0($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
-
- "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
- "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
- "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
- "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
- "sw $s3, 0(%[dst_a]) \n"
- "sw $s4, 0(%[dst_b]) \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
-      "precrq.ph.w $s2, $s0, $t9              \n"  // |B7|A7|B6|A6|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
- "sw $s3, 4($s5) \n"
- "sw $s4, 4($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
-
- "addiu %[src], 4 \n"
- "addiu $t1, -1 \n"
- "sll $t0, %[dst_stride_a], 1 \n"
- "sll $t8, %[dst_stride_b], 1 \n"
- "sw $s3, 4(%[dst_a]) \n"
- "sw $s4, 4(%[dst_b]) \n"
- "addu %[dst_a], %[dst_a], $t0 \n"
- "bnez $t1, 1b \n"
- " addu %[dst_b], %[dst_b], $t8 \n"
- "b 2f \n"
- " nop \n"
-
- // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
- "11: \n"
- "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
- "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
- "addu $s5, %[dst_a], %[dst_stride_a] \n"
- "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
- "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
- "addu $s6, %[dst_b], %[dst_stride_b] \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
-
- "swr $s3, 0($s5) \n"
- "swl $s3, 3($s5) \n"
- "swr $s4, 0($s6) \n"
- "swl $s4, 3($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
-
- "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
- "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
- "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
- "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
- "swr $s3, 0(%[dst_a]) \n"
- "swl $s3, 3(%[dst_a]) \n"
- "swr $s4, 0(%[dst_b]) \n"
- "swl $s4, 3(%[dst_b]) \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
-      "precrq.ph.w $s2, $s0, $t9              \n"  // |B7|A7|B6|A6|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
-
- "swr $s3, 4($s5) \n"
- "swl $s3, 7($s5) \n"
- "swr $s4, 4($s6) \n"
- "swl $s4, 7($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
-
- "addiu %[src], 4 \n"
- "addiu $t1, -1 \n"
- "sll $t0, %[dst_stride_a], 1 \n"
- "sll $t8, %[dst_stride_b], 1 \n"
- "swr $s3, 4(%[dst_a]) \n"
- "swl $s3, 7(%[dst_a]) \n"
- "swr $s4, 4(%[dst_b]) \n"
- "swl $s4, 7(%[dst_b]) \n"
- "addu %[dst_a], %[dst_a], $t0 \n"
- "bnez $t1, 11b \n"
- " addu %[dst_b], %[dst_b], $t8 \n"
-
- "2: \n"
- ".set pop \n"
- : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
- [width] "+r"(width), [src_stride] "+r"(src_stride)
- : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
- "s2", "s3", "s4", "s5", "s6");
-}
-
-#endif  // !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) &&
-        // (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
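Each DSPR2 kernel above branches once on word alignment of the destination and its stride, then uses plain sw stores on the aligned path and swr/swl pairs on the unaligned one. In portable C the same split reduces to this sketch (illustrative, not part of the deleted file):

    #include <stdint.h>
    #include <string.h>

    // Hypothetical C analogue of the sw vs. swr/swl split: a direct word
    // store when the address is 4-byte aligned, a byte-wise store otherwise.
    static void StoreWord(uint8_t* dst, uint32_t v) {
      if (((uintptr_t)dst & 3) == 0) {
        *(uint32_t*)(void*)dst = v;  // aligned path (sw)
      } else {
        memcpy(dst, &v, 4);          // unaligned path (swr + swl)
      }
    }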
diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc
deleted file mode 100644
index 04e19e29..00000000
--- a/files/source/rotate_gcc.cc
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * Copyright 2015 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-// Transpose 8x8. 32 or 64 bit, but not NaCl for 64 bit.
-#if defined(HAS_TRANSPOSEWX8_SSSE3)
-void TransposeWx8_SSSE3(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width) {
- asm volatile(
- // Read in the data from the source pointer.
- // First round of bit swap.
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movq (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "movq (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movq (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "movq (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movq (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "lea 0x8(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "neg %3 \n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(dst_stride)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
-
-// Transpose 16x8. 64 bit
-#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-void TransposeWx8_Fast_SSSE3(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width) {
- asm volatile(
- // Read in the data from the source pointer.
- // First round of bit swap.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm9 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqu (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm2,%%xmm10 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm10 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movdqa %%xmm10,%%xmm11 \n"
- "movdqu (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqu (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm4,%%xmm12 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm12 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movdqa %%xmm12,%%xmm13 \n"
- "movdqu (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqu (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm6,%%xmm14 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "punpckhbw %%xmm7,%%xmm14 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "movdqa %%xmm14,%%xmm15 \n"
- "lea 0x10(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "neg %3 \n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "punpcklwd %%xmm10,%%xmm8 \n"
- "punpcklwd %%xmm11,%%xmm9 \n"
- "movdqa %%xmm8,%%xmm10 \n"
- "movdqa %%xmm9,%%xmm11 \n"
- "palignr $0x8,%%xmm10,%%xmm10 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "punpcklwd %%xmm14,%%xmm12 \n"
- "punpcklwd %%xmm15,%%xmm13 \n"
- "movdqa %%xmm12,%%xmm14 \n"
- "movdqa %%xmm13,%%xmm15 \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm12,%%xmm8 \n"
- "movq %%xmm8,(%1) \n"
- "movdqa %%xmm8,%%xmm12 \n"
- "palignr $0x8,%%xmm12,%%xmm12 \n"
- "movq %%xmm12,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm14,%%xmm10 \n"
- "movdqa %%xmm10,%%xmm14 \n"
- "movq %%xmm10,(%1) \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "punpckldq %%xmm13,%%xmm9 \n"
- "movq %%xmm14,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm9,%%xmm13 \n"
- "movq %%xmm9,(%1) \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movq %%xmm13,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm15,%%xmm11 \n"
- "movq %%xmm11,(%1) \n"
- "movdqa %%xmm11,%%xmm15 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "sub $0x10,%2 \n"
- "movq %%xmm15,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(dst_stride)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
- "xmm15");
-}
-#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-
-// Transpose UV 8x8. 64 bit.
-#if defined(HAS_TRANSPOSEUVWX8_SSE2)
-void TransposeUVWx8_SSE2(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width) {
- asm volatile(
- // Read in the data from the source pointer.
- // First round of bit swap.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%4),%%xmm1 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu (%0,%4),%%xmm3 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm3 \n"
- "movdqu (%0),%%xmm4 \n"
- "movdqu (%0,%4),%%xmm5 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu (%0,%4),%%xmm7 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm6,%%xmm8 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %4 \n"
- "lea 0x10(%0,%4,8),%0 \n"
- "punpckhbw %%xmm7,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm7 \n"
- "neg %4 \n"
- // Second round of bit swap.
- "movdqa %%xmm0,%%xmm8 \n"
- "movdqa %%xmm1,%%xmm9 \n"
- "punpckhwd %%xmm2,%%xmm8 \n"
- "punpckhwd %%xmm3,%%xmm9 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm2 \n"
- "movdqa %%xmm9,%%xmm3 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "movdqa %%xmm5,%%xmm9 \n"
- "punpckhwd %%xmm6,%%xmm8 \n"
- "punpckhwd %%xmm7,%%xmm9 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm8,%%xmm6 \n"
- "movdqa %%xmm9,%%xmm7 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8 \n"
- "punpckldq %%xmm4,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n" // Write back U channel
- "movhpd %%xmm0,(%2) \n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movlpd %%xmm2,(%1) \n"
- "movhpd %%xmm2,(%2) \n"
- "punpckhdq %%xmm6,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm1,%%xmm8 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movlpd %%xmm1,(%1) \n"
- "movhpd %%xmm1,(%2) \n"
- "punpckhdq %%xmm5,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm3,%%xmm8 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movlpd %%xmm3,(%1) \n"
- "movhpd %%xmm3,(%2) \n"
- "punpckhdq %%xmm7,%%xmm8 \n"
- "sub $0x8,%3 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_a), // %1
- "+r"(dst_b), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(src_stride)), // %4
- "r"((intptr_t)(dst_stride_a)), // %5
- "r"((intptr_t)(dst_stride_b)) // %6
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7", "xmm8", "xmm9");
-}
-#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
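The SSSE3/SSE2 kernels above each transpose one strip of 8 source rows per call; the caller in rotate.cc walks the plane strip by strip and finishes leftover rows with the C reference. A hedged sketch of that caller shape, written against the C kernels from rotate_common.cc:

    #include <stdint.h>
    #include "libyuv/rotate_row.h"

    // Sketch: consume the plane 8 rows at a time with the Wx8 kernel (the
    // real code substitutes TransposeWx8_SSSE3 etc. after CPU detection),
    // then hand any remaining rows to the generic WxH reference.
    void TransposePlaneSketch(const uint8_t* src, int src_stride,
                              uint8_t* dst, int dst_stride,
                              int width, int height) {
      int i = height;
      while (i >= 8) {
        TransposeWx8_C(src, src_stride, dst, dst_stride, width);
        src += 8 * src_stride;  // next 8-row strip of the source
        dst += 8;               // transposed strip starts 8 columns over
        i -= 8;
      }
      if (i > 0) {
        TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
      }
    }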
diff --git a/files/source/rotate_mmi.cc b/files/source/rotate_mmi.cc
deleted file mode 100644
index f8de6083..00000000
--- a/files/source/rotate_mmi.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for MIPS MMI (Loongson3A).
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-void TransposeWx8_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width) {
- uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
- uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
- uint8_t* src_tmp = nullptr;
-
- __asm__ volatile(
- "1: \n\t"
- "ldc1 %[tmp12], 0x00(%[src]) \n\t"
- "dadd %[src_tmp], %[src], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp0 = (00 10 01 11 02 12 03 13) */
- "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
- /* tmp1 = (04 14 05 15 06 16 07 17) */
- "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp2 = (20 30 21 31 22 32 23 33) */
- "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
- /* tmp3 = (24 34 25 35 26 36 27 37) */
- "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
-
- /* tmp4 = (00 10 20 30 01 11 21 31) */
- "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
- /* tmp5 = (02 12 22 32 03 13 23 33) */
- "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
- /* tmp6 = (04 14 24 34 05 15 25 35) */
- "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
- /* tmp7 = (06 16 26 36 07 17 27 37) */
- "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp0 = (40 50 41 51 42 52 43 53) */
- "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
- /* tmp1 = (44 54 45 55 46 56 47 57) */
- "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp2 = (60 70 61 71 62 72 63 73) */
- "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
- /* tmp3 = (64 74 65 75 66 76 67 77) */
- "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
-
- /* tmp8 = (40 50 60 70 41 51 61 71) */
- "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
- /* tmp9 = (42 52 62 72 43 53 63 73) */
- "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
- /* tmp10 = (44 54 64 74 45 55 65 75) */
- "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
- /* tmp11 = (46 56 66 76 47 57 67 77) */
- "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
-
- /* tmp0 = (00 10 20 30 40 50 60 70) */
- "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
- /* tmp1 = (01 11 21 31 41 51 61 71) */
- "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
-
- /* tmp0 = (02 12 22 32 42 52 62 72) */
- "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
- /* tmp1 = (03 13 23 33 43 53 63 73) */
- "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
-
- /* tmp0 = (04 14 24 34 44 54 64 74) */
- "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
- /* tmp1 = (05 15 25 35 45 55 65 75) */
- "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
-
- /* tmp0 = (06 16 26 36 46 56 66 76) */
- "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
- /* tmp1 = (07 17 27 37 47 57 67 77) */
- "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
-
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "daddi %[src], %[src], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
- [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
- [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
- [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
- [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst),
- [src_tmp] "+&r"(src_tmp)
- : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride),
- [dst_stride] "r"(dst_stride)
- : "memory");
-}
-
-void TransposeUVWx8_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width) {
- uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
- uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
- uint8_t* src_tmp = nullptr;
-
- __asm__ volatile(
- "1: \n\t"
- /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */
- "ldc1 %[tmp12], 0x00(%[src]) \n\t"
- "dadd %[src_tmp], %[src], %[src_stride] \n\t"
- /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */
- "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
- /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */
- "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */
- "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
- /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */
- "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
-
- /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */
- "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
- /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */
- "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
- /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */
- "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
- /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */
- "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
-      "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
-      /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */
-      "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */
- "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
- /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */
- "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
-      "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
-      /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */
-      "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */
- "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
- /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */
- "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
-
- /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */
- "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
- /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */
- "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
- /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */
- "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
- /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */
- "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
-
- /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */
- "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
- /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */
- "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
-
- /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */
- "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
- /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */
- "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
- "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
- "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
-
- /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */
- "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
- /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */
- "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
- "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
- "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
-
- /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */
- "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
- /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */
- "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
- "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
- "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
-
- "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
- "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
- "daddiu %[src], %[src], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
- [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
- [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
- [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
- [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a),
- [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp)
- : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a),
- [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride)
- : "memory");
-}
-
-#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
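The MMI transpose is built entirely from punpck interleaves on 64-bit lanes: a byte round, a halfword round, and a word round turn 8 row fragments into 8 column fragments. As a reference point, punpcklbh on little-endian lanes behaves like this C model (a sketch, not a Loongson intrinsic):

    #include <stdint.h>

    // Hypothetical C model of punpcklbh: interleave the low four bytes of
    // a and b, yielding a0 b0 a1 b1 a2 b2 a3 b3 in memory order.
    static uint64_t PunpcklbhSketch(uint64_t a, uint64_t b) {
      uint64_t r = 0;
      for (int i = 0; i < 4; ++i) {
        r |= ((a >> (8 * i)) & 0xff) << (16 * i);
        r |= ((b >> (8 * i)) & 0xff) << (16 * i + 8);
      }
      return r;
    }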
diff --git a/files/source/row_any.cc b/files/source/row_any.cc
deleted file mode 100644
index 06ca723a..00000000
--- a/files/source/row_any.cc
+++ /dev/null
@@ -1,1429 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include <string.h> // For memset.
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// The memset of temp clears the staged source data (not the destination) so
-// that SIMD kernels reading a full multiple of 16 bytes do not trigger msan
-// errors. The memset is not needed in production, as the garbage values are
-// processed but not used, though there may be edge cases for subsampling.
-// The buffer size is based on the largest read, which can be inferred from
-// the source type (e.g. ARGB) and the mask (the last parameter), or by
-// examining how far the source code advances the source pointers.
-
-// Subsampled source width needs to be rounded up by 1 if odd; e.g. SS(5, 1)
-// yields 3, the number of UV samples covering 5 Y pixels.
-#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
-
-// Any 4 planes to 1 with yuvconstants
-#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
- const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 5]); \
- memset(temp, 0, 64 * 4); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 192, a_buf + n, r); \
- ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
- yuvconstants, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
- SS(r, DUVSHIFT) * BPP); \
- }
-
-#ifdef HAS_I422ALPHATOARGBROW_SSSE3
-ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
-#endif
-#ifdef HAS_I422ALPHATOARGBROW_AVX2
-ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
-#endif
-#ifdef HAS_I422ALPHATOARGBROW_NEON
-ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
-#endif
-#ifdef HAS_I422ALPHATOARGBROW_MSA
-ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
-#endif
-#undef ANY41C
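Every ANY* wrapper in this file expands to the same shape: run the SIMD kernel over the largest multiple of its natural width, stage the remainder in a zeroed, padded temp buffer, run one more full SIMD block there, and copy back only the valid pixels. Expanded by hand for a 1:1 row function with a 16-pixel kernel (SimdRow is a hypothetical stand-in; SIMD_ALIGNED comes from libyuv/row.h):

    #include <string.h>
    #include "libyuv/row.h"

    void SimdRow(const uint8_t* src, uint8_t* dst, int width);  // hypothetical

    // Hand expansion of the ANY pattern for MASK = 15.
    void AnyRowSketch(const uint8_t* src, uint8_t* dst, int width) {
      SIMD_ALIGNED(uint8_t temp[128 * 2]);
      memset(temp, 0, 128);            // zero staged source for msan
      int r = width & 15;              // leftover pixels
      int n = width & ~15;             // largest multiple of 16
      if (n > 0) {
        SimdRow(src, dst, n);          // fast path over the bulk
      }
      memcpy(temp, src + n, r);        // stage the tail, zero padded
      SimdRow(temp, temp + 128, 16);   // one full block on the staging buffer
      memcpy(dst + n, temp + 128, r);  // write back only r valid pixels
    }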
-
-// Any 3 planes to 1.
-#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
- const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 4]); \
- memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
- SS(r, DUVSHIFT) * BPP); \
- }
-
-// Merge functions.
-#ifdef HAS_MERGERGBROW_SSSE3
-ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
-#endif
-#ifdef HAS_MERGERGBROW_NEON
-ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
-#endif
-#ifdef HAS_MERGERGBROW_MMI
-ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
-#endif
-#ifdef HAS_I422TOYUY2ROW_SSE2
-ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
-ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
-#endif
-#ifdef HAS_I422TOYUY2ROW_AVX2
-ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31)
-ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31)
-#endif
-#ifdef HAS_I422TOYUY2ROW_NEON
-ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
-#endif
-#ifdef HAS_I422TOYUY2ROW_MSA
-ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
-#endif
-#ifdef HAS_I422TOYUY2ROW_MMI
-ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7)
-#endif
-#ifdef HAS_I422TOUYVYROW_NEON
-ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
-#endif
-#ifdef HAS_I422TOUYVYROW_MSA
-ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
-#endif
-#ifdef HAS_I422TOUYVYROW_MMI
-ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7)
-#endif
-#ifdef HAS_BLENDPLANEROW_AVX2
-ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
-#endif
-#ifdef HAS_BLENDPLANEROW_SSSE3
-ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
-#endif
-#ifdef HAS_BLENDPLANEROW_MMI
-ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7)
-#endif
-#undef ANY31
-
-// Note that odd-width replication below applies to 444 formats as well,
-// because the ARM implementation subsamples 444 to 422 internally: for an
-// odd width, the last U and V samples are duplicated so the extra Y pixel
-// still has a chroma pair.
-// Any 3 planes to 1 with yuvconstants
-#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
- const uint8_t* v_buf, uint8_t* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 4]); \
- memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- if (width & 1) { \
- temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
- temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \
- } \
- ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \
- MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \
- SS(r, DUVSHIFT) * BPP); \
- }
-
-#ifdef HAS_I422TOARGBROW_SSSE3
-ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
-#endif
-#ifdef HAS_I422TOAR30ROW_SSSE3
-ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
-#endif
-#ifdef HAS_I422TOAR30ROW_AVX2
-ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15)
-#endif
-#ifdef HAS_I444TOARGBROW_SSSE3
-ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
-ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
-ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
-#endif // HAS_I444TOARGBROW_SSSE3
-#ifdef HAS_I422TORGB24ROW_AVX2
-ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
-#endif
-#ifdef HAS_I422TOARGBROW_AVX2
-ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
-#endif
-#ifdef HAS_I422TORGBAROW_AVX2
-ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
-#endif
-#ifdef HAS_I444TOARGBROW_AVX2
-ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
-#endif
-#ifdef HAS_I422TOARGB4444ROW_AVX2
-ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15)
-#endif
-#ifdef HAS_I422TOARGB1555ROW_AVX2
-ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15)
-#endif
-#ifdef HAS_I422TORGB565ROW_AVX2
-ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
-#endif
-#ifdef HAS_I422TOARGBROW_NEON
-ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
-ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
-ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
-ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
-ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
-ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
-ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
-#endif
-#ifdef HAS_I422TOARGBROW_MSA
-ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
-ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
-ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
-ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
-ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
-ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
-ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
-#endif
-#undef ANY31C
-
-// Any 3 planes of 16 bit to 1 with yuvconstants
-// TODO(fbarchard): consider sharing this code with ANY31C
-#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
- void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \
- uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
- int width) { \
- SIMD_ALIGNED(T temp[16 * 3]); \
- SIMD_ALIGNED(uint8_t out[64]); \
- memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n, r * SBPP); \
- memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
- memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
- ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \
- }
-
-#ifdef HAS_I210TOAR30ROW_SSSE3
-ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
-#endif
-#ifdef HAS_I210TOARGBROW_SSSE3
-ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
-#endif
-#ifdef HAS_I210TOARGBROW_AVX2
-ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
-#endif
-#ifdef HAS_I210TOAR30ROW_AVX2
-ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
-#endif
-#undef ANY31CT
-
-// Any 2 planes to 1.
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
- void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
- int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
- } \
- memcpy(temp, y_buf + n * SBPP, r * SBPP); \
- memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
- SS(r, UVSHIFT) * SBPP2); \
- ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
-
-// Merge functions.
-#ifdef HAS_MERGEUVROW_SSE2
-ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
-#endif
-#ifdef HAS_MERGEUVROW_AVX2
-ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
-#endif
-#ifdef HAS_MERGEUVROW_NEON
-ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
-#endif
-#ifdef HAS_MERGEUVROW_MSA
-ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
-#endif
-#ifdef HAS_MERGEUVROW_MMI
-ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7)
-#endif
-#ifdef HAS_NV21TOYUV24ROW_NEON
-ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
-#endif
-#ifdef HAS_NV21TOYUV24ROW_AVX2
-ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
-#endif
-// Math functions.
-#ifdef HAS_ARGBMULTIPLYROW_SSE2
-ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBADDROW_SSE2
-ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_SSE2
-ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBMULTIPLYROW_AVX2
-ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBADDROW_AVX2
-ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_AVX2
-ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBMULTIPLYROW_NEON
-ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBADDROW_NEON
-ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_NEON
-ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBMULTIPLYROW_MSA
-ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBMULTIPLYROW_MMI
-ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1)
-#endif
-#ifdef HAS_ARGBADDROW_MSA
-ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBADDROW_MMI
-ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_MSA
-ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSUBTRACTROW_MMI
-ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1)
-#endif
-#ifdef HAS_SOBELROW_SSE2
-ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
-#endif
-#ifdef HAS_SOBELROW_NEON
-ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
-#endif
-#ifdef HAS_SOBELROW_MSA
-ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
-#endif
-#ifdef HAS_SOBELROW_MMI
-ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7)
-#endif
-#ifdef HAS_SOBELTOPLANEROW_SSE2
-ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
-#endif
-#ifdef HAS_SOBELTOPLANEROW_NEON
-ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
-#endif
-#ifdef HAS_SOBELTOPLANEROW_MSA
-ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
-#endif
-#ifdef HAS_SOBELTOPLANEROW_MMI
-ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7)
-#endif
-#ifdef HAS_SOBELXYROW_SSE2
-ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
-#endif
-#ifdef HAS_SOBELXYROW_NEON
-ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
-#endif
-#ifdef HAS_SOBELXYROW_MSA
-ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
-#endif
-#ifdef HAS_SOBELXYROW_MMI
-ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7)
-#endif
-#undef ANY21
-
-// Any 2 planes to 1 with yuvconstants
-#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
- void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 3]); \
- memset(temp, 0, 128 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n * SBPP, r * SBPP); \
- memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
- SS(r, UVSHIFT) * SBPP2); \
- ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \
- }
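
// A hedged note on the staging arithmetic above: SS(width, shift) is the
// round-up shift assumed to be defined earlier in this file; the hypothetical
// SS_SKETCH below restates it for illustration only.
#include <assert.h>
#define SS_SKETCH(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
static void ss_sketch_example(void) {
  // With UVSHIFT = 1 (one UV sample per 2 pixels) and r = 5 leftover pixels,
  // SS(5, 1) = 3 UV samples, so 3 * SBPP2 bytes are staged at temp + 128
  // alongside the 5 Y bytes staged at temp.
  assert(SS_SKETCH(5, 1) == 3);
}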
-
-// Biplanar to RGB.
-#ifdef HAS_NV12TOARGBROW_SSSE3
-ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV12TOARGBROW_AVX2
-ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
-#endif
-#ifdef HAS_NV12TOARGBROW_NEON
-ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV12TOARGBROW_MSA
-ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV21TOARGBROW_SSSE3
-ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV21TOARGBROW_AVX2
-ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
-#endif
-#ifdef HAS_NV21TOARGBROW_NEON
-ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV21TOARGBROW_MSA
-ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV12TORGB24ROW_NEON
-ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
-#endif
-#ifdef HAS_NV21TORGB24ROW_NEON
-ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7)
-#endif
-#ifdef HAS_NV12TORGB24ROW_SSSE3
-ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
-#endif
-#ifdef HAS_NV21TORGB24ROW_SSSE3
-ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
-#endif
-#ifdef HAS_NV12TORGB24ROW_AVX2
-ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
-#endif
-#ifdef HAS_NV21TORGB24ROW_AVX2
-ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
-#endif
-#ifdef HAS_NV12TORGB565ROW_SSSE3
-ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
-#endif
-#ifdef HAS_NV12TORGB565ROW_AVX2
-ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
-#endif
-#ifdef HAS_NV12TORGB565ROW_NEON
-ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
-#endif
-#ifdef HAS_NV12TORGB565ROW_MSA
-ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
-#endif
-#undef ANY21C
-
-// Any 1 to 1.
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 2]); \
- memset(temp, 0, 128); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
- ANY_SIMD(temp, temp + 128, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
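
// A minimal sketch of the width split every ANY wrapper here performs. MASK
// is (SIMD step - 1): n is the largest multiple of the step, handled in
// place, and the r leftover pixels are padded out to one full step in temp.
#include <assert.h>
static void any_width_split_example(void) {
  int width = 100;
  int mask = 31;          // e.g. an AVX2 kernel stepping 32 pixels at a time
  int n = width & ~mask;  // 96 pixels go straight to the SIMD kernel
  int r = width & mask;   // 4 pixels run through the aligned temp buffer
  assert(n == 96 && r == 4);
}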
-
-#ifdef HAS_COPYROW_AVX
-ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
-#endif
-#ifdef HAS_COPYROW_SSE2
-ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
-#endif
-#ifdef HAS_COPYROW_NEON
-ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
-ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
-ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
-ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
-ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
-ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_AVX2)
-ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
-ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31)
-#endif
-#if defined(HAS_ARGBTORAWROW_AVX2)
-ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
-#endif
-#if defined(HAS_ARGBTORGB565ROW_AVX2)
-ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
-#endif
-#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
-ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
-ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
-#endif
-#if defined(HAS_ABGRTOAR30ROW_SSSE3)
-ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
-#endif
-#if defined(HAS_ARGBTOAR30ROW_SSSE3)
-ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
-#endif
-#if defined(HAS_ABGRTOAR30ROW_AVX2)
-ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
-#endif
-#if defined(HAS_ARGBTOAR30ROW_AVX2)
-ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
-#endif
-#if defined(HAS_J400TOARGBROW_SSE2)
-ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
-#endif
-#if defined(HAS_J400TOARGBROW_AVX2)
-ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
-#endif
-#if defined(HAS_I400TOARGBROW_SSE2)
-ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
-#endif
-#if defined(HAS_I400TOARGBROW_AVX2)
-ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
-#endif
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
-ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
-ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
-ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
-ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
-ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
-#endif
-#if defined(HAS_RAWTORGB24ROW_SSSE3)
-ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
-#endif
-#if defined(HAS_RGB565TOARGBROW_AVX2)
-ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
-#endif
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
-ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
-#endif
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
-ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_NEON)
-ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
-ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
-ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
-ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
-ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
-ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
-ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_MSA)
-ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
-ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
-ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
-ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
-ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
-ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
-ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_MMI)
-ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3)
-ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3)
-ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3)
-ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3)
-ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3)
-ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3)
-ANY11(I400ToARGBRow_Any_MMI, I400ToARGBRow_MMI, 0, 1, 4, 7)
-#endif
-#if defined(HAS_RAWTORGB24ROW_NEON)
-ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
-#endif
-#if defined(HAS_RAWTORGB24ROW_MSA)
-ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
-#endif
-#if defined(HAS_RAWTORGB24ROW_MMI)
-ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3)
-#endif
-#ifdef HAS_ARGBTOYROW_AVX2
-ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
-#endif
-#ifdef HAS_ARGBTOYJROW_AVX2
-ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
-#endif
-#ifdef HAS_UYVYTOYROW_AVX2
-ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
-#endif
-#ifdef HAS_YUY2TOYROW_AVX2
-ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
-#endif
-#ifdef HAS_ARGBTOYROW_SSSE3
-ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
-#endif
-#ifdef HAS_BGRATOYROW_SSSE3
-ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
-ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
-ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
-ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
-ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBTOYJROW_SSSE3
-ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBTOYROW_NEON
-ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ARGBTOYROW_MSA
-ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBTOYROW_MMI
-ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ARGBTOYJROW_NEON
-ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ARGBTOYJROW_MSA
-ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBTOYJROW_MMI
-ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7)
-#endif
-#ifdef HAS_BGRATOYROW_NEON
-ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_BGRATOYROW_MSA
-ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
-#endif
-#ifdef HAS_BGRATOYROW_MMI
-ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ABGRTOYROW_NEON
-ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ABGRTOYROW_MSA
-ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ABGRTOYROW_MMI
-ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7)
-#endif
-#ifdef HAS_RGBATOYROW_NEON
-ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
-#endif
-#ifdef HAS_RGBATOYROW_MSA
-ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
-#endif
-#ifdef HAS_RGBATOYROW_MMI
-ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7)
-#endif
-#ifdef HAS_RGB24TOYROW_NEON
-ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
-#endif
-#ifdef HAS_RGB24TOYROW_MSA
-ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
-#endif
-#ifdef HAS_RGB24TOYROW_MMI
-ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7)
-#endif
-#ifdef HAS_RAWTOYROW_NEON
-ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
-#endif
-#ifdef HAS_RAWTOYROW_MSA
-ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
-#endif
-#ifdef HAS_RAWTOYROW_MMI
-ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7)
-#endif
-#ifdef HAS_RGB565TOYROW_NEON
-ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
-#endif
-#ifdef HAS_RGB565TOYROW_MSA
-ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
-#endif
-#ifdef HAS_RGB565TOYROW_MMI
-ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7)
-#endif
-#ifdef HAS_ARGB1555TOYROW_NEON
-ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
-#endif
-#ifdef HAS_ARGB1555TOYROW_MSA
-ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
-#endif
-#ifdef HAS_ARGB1555TOYROW_MMI
-ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7)
-#endif
-#ifdef HAS_ARGB4444TOYROW_NEON
-ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
-#endif
-#ifdef HAS_ARGB4444TOYROW_MMI
-ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7)
-#endif
-#ifdef HAS_YUY2TOYROW_NEON
-ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
-#endif
-#ifdef HAS_UYVYTOYROW_NEON
-ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
-#endif
-#ifdef HAS_YUY2TOYROW_MSA
-ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
-#endif
-#ifdef HAS_YUY2TOYROW_MMI
-ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7)
-#endif
-#ifdef HAS_UYVYTOYROW_MSA
-ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
-#endif
-#ifdef HAS_UYVYTOYROW_MMI
-ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
-#endif
-#ifdef HAS_AYUVTOYROW_NEON
-ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
-#endif
-#ifdef HAS_AYUVTOYROW_NEON
-ANY11(UVToVURow_Any_NEON, UVToVURow_NEON, 0, 2, 2, 15)
-#endif
-#ifdef HAS_RGB24TOARGBROW_NEON
-ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
-#endif
-#ifdef HAS_RGB24TOARGBROW_MSA
-ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
-#endif
-#ifdef HAS_RGB24TOARGBROW_MMI
-ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3)
-#endif
-#ifdef HAS_RAWTOARGBROW_NEON
-ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
-#endif
-#ifdef HAS_RAWTOARGBROW_MSA
-ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
-#endif
-#ifdef HAS_RAWTOARGBROW_MMI
-ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3)
-#endif
-#ifdef HAS_RGB565TOARGBROW_NEON
-ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
-#endif
-#ifdef HAS_RGB565TOARGBROW_MSA
-ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
-#endif
-#ifdef HAS_RGB565TOARGBROW_MMI
-ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3)
-#endif
-#ifdef HAS_ARGB1555TOARGBROW_NEON
-ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
-#endif
-#ifdef HAS_ARGB1555TOARGBROW_MSA
-ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
-#endif
-#ifdef HAS_ARGB1555TOARGBROW_MMI
-ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3)
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_NEON
-ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_MSA
-ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_MMI
-ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_AVX2
-ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
-ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_NEON
-ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_MSA
-ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_MMI
-ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1)
-#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
-ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
-ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31)
-#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
-ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
-ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
-#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_MMI
-ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7)
-#endif
-#undef ANY11
-
-// Any 1 to 1 blended. Destination is read, modify, write.
-#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 2]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
- memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \
- ANY_SIMD(temp, temp + 64, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
- }
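
// A scalar sketch of why ANY11B stages the destination as well: kernels such
// as ARGBCopyAlphaRow rewrite only part of each destination pixel, so the
// untouched bytes must survive the round trip through temp. The _Sketch
// helper below is hypothetical, for illustration only.
#include <stdint.h>
static void ARGBCopyAlphaRow_Sketch(const uint8_t* src_argb,
                                    uint8_t* dst_argb,
                                    int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[x * 4 + 3] = src_argb[x * 4 + 3];  // alpha only; BGR kept as-is
  }
}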
-
-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
-ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
-#endif
-#ifdef HAS_ARGBCOPYALPHAROW_SSE2
-ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBCOPYALPHAROW_MMI
-ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1)
-#endif
-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
-ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
-#endif
-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
-ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
-#endif
-#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI
-ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7)
-#endif
-#undef ANY11B
-
-// Any 1 to 1 with parameter.
-#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 2]); \
- memset(temp, 0, 64); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, param, n); \
- } \
- memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
- ANY_SIMD(temp, temp + 64, param, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
- }
-
-#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
-ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
- ARGBToRGB565DitherRow_SSE2,
- const uint32_t,
- 4,
- 2,
- 3)
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
-ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
- ARGBToRGB565DitherRow_AVX2,
- const uint32_t,
- 4,
- 2,
- 7)
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
-ANY11P(ARGBToRGB565DitherRow_Any_NEON,
- ARGBToRGB565DitherRow_NEON,
- const uint32_t,
- 4,
- 2,
- 7)
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
-ANY11P(ARGBToRGB565DitherRow_Any_MSA,
- ARGBToRGB565DitherRow_MSA,
- const uint32_t,
- 4,
- 2,
- 7)
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
-ANY11P(ARGBToRGB565DitherRow_Any_MMI,
- ARGBToRGB565DitherRow_MMI,
- const uint32_t,
- 4,
- 2,
- 3)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_SSSE3
-ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_NEON
-ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_MSA
-ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBSHUFFLEROW_MMI
-ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1)
-#endif
-#undef ANY11P
-
-// Any 1 to 1 with scale parameter, converting between 8- and 16-bit planes.
-// SBPP and BPP are measured in bytes per element.
-#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
- void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
- SIMD_ALIGNED(STYPE temp[32]); \
- SIMD_ALIGNED(DTYPE out[32]); \
- memset(temp, 0, 32 * SBPP); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, scale, n); \
- } \
- memcpy(temp, src_ptr + n, r * SBPP); \
- ANY_SIMD(temp, out, scale, MASK + 1); \
- memcpy(dst_ptr + n, out, r * BPP); \
- }
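
// A hedged scalar sketch of the kernels this wrapper feeds: Convert16To8
// applies a fixed-point scale with 16 fractional bits and clamps to a byte,
// so e.g. scale = 16384 maps 10-bit 0..1023 onto 0..255. The _Sketch name is
// hypothetical.
#include <stdint.h>
static uint8_t Convert16To8_Sketch(uint16_t v, int scale) {
  int out = (v * scale) >> 16;              // fixed-point downscale
  return (uint8_t)(out > 255 ? 255 : out);  // clamp to byte range
}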
-
-#ifdef HAS_CONVERT16TO8ROW_SSSE3
-ANY11C(Convert16To8Row_Any_SSSE3,
- Convert16To8Row_SSSE3,
- 2,
- 1,
- uint16_t,
- uint8_t,
- 15)
-#endif
-#ifdef HAS_CONVERT16TO8ROW_AVX2
-ANY11C(Convert16To8Row_Any_AVX2,
- Convert16To8Row_AVX2,
- 2,
- 1,
- uint16_t,
- uint8_t,
- 31)
-#endif
-#ifdef HAS_CONVERT8TO16ROW_SSE2
-ANY11C(Convert8To16Row_Any_SSE2,
- Convert8To16Row_SSE2,
- 1,
- 2,
- uint8_t,
- uint16_t,
- 15)
-#endif
-#ifdef HAS_CONVERT8TO16ROW_AVX2
-ANY11C(Convert8To16Row_Any_AVX2,
- Convert8To16Row_AVX2,
- 1,
- 2,
- uint8_t,
- uint16_t,
- 31)
-#endif
-#undef ANY11C
-
-// Any 1 to 1 with a float parameter, e.g. half-float and byte-to-float rows.
-#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \
- void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
- SIMD_ALIGNED(ST temp[32]); \
- SIMD_ALIGNED(T out[32]); \
- memset(temp, 0, SBPP * 32); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, param, n); \
- } \
- memcpy(temp, src_ptr + n, r * SBPP); \
- ANY_SIMD(temp, out, param, MASK + 1); \
- memcpy(dst_ptr + n, out, r * BPP); \
- }
-
-#ifdef HAS_HALFFLOATROW_SSE2
-ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
-#endif
-#ifdef HAS_HALFFLOATROW_AVX2
-ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
-#endif
-#ifdef HAS_HALFFLOATROW_F16C
-ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
-ANY11P16(HalfFloat1Row_Any_F16C,
- HalfFloat1Row_F16C,
- uint16_t,
- uint16_t,
- 2,
- 2,
- 15)
-#endif
-#ifdef HAS_HALFFLOATROW_NEON
-ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
-ANY11P16(HalfFloat1Row_Any_NEON,
- HalfFloat1Row_NEON,
- uint16_t,
- uint16_t,
- 2,
- 2,
- 7)
-#endif
-#ifdef HAS_HALFFLOATROW_MSA
-ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
-#endif
-#ifdef HAS_BYTETOFLOATROW_NEON
-ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7)
-#endif
-#undef ANY11P16
-
-// Any 1 to 1 with yuvconstants
-#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 2]); \
- memset(temp, 0, 128); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
- ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
-#if defined(HAS_YUY2TOARGBROW_SSSE3)
-ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
-ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
-#endif
-#if defined(HAS_YUY2TOARGBROW_AVX2)
-ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
-ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
-#endif
-#if defined(HAS_YUY2TOARGBROW_NEON)
-ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
-ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
-#endif
-#if defined(HAS_YUY2TOARGBROW_MSA)
-ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
-ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
-#endif
-#undef ANY11C
-
-// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
-#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
- void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \
- ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \
- SIMD_ALIGNED(uint8_t temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
- } \
- memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
- memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
- ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
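
// A hedged scalar sketch of the two-row blend these wrappers drive:
// source_y_fraction in [0, 256] weights the second row, mirroring the
// rounded shift the SIMD kernels use. Hypothetical helper for illustration.
#include <stdint.h>
static uint8_t InterpolatePixel_Sketch(uint8_t row0, uint8_t row1,
                                       int source_y_fraction) {
  int f1 = source_y_fraction;  // weight of the second source row
  int f0 = 256 - f1;           // weight of the first source row
  return (uint8_t)((row0 * f0 + row1 * f1 + 128) >> 8);  // rounded blend
}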
-
-#ifdef HAS_INTERPOLATEROW_AVX2
-ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
-#endif
-#ifdef HAS_INTERPOLATEROW_SSSE3
-ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
-#endif
-#ifdef HAS_INTERPOLATEROW_NEON
-ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
-#endif
-#ifdef HAS_INTERPOLATEROW_MSA
-ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
-#endif
-#ifdef HAS_INTERPOLATEROW_MMI
-ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7)
-#endif
-#undef ANY11T
-
-// Any 1 to 1 mirror.
-#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 2]); \
- memset(temp, 0, 64); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
- } \
- memcpy(temp, src_ptr, r * BPP); \
- ANY_SIMD(temp, temp + 64, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
- }
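
// A worked example of the mirror tail above: the SIMD call mirrors input
// pixels r..width-1 into output pixels 0..n-1, then the leading r pixels are
// mirrored as one full step through temp, so the r wanted results sit
// (MASK + 1 - r) * BPP bytes into the output half of temp.
#include <assert.h>
static void mirror_tail_example(void) {
  enum { MASK = 15, BPP = 1 };
  int width = 20;
  int r = width & MASK;             // 4 leading pixels left over
  int n = width & ~MASK;            // 16 pixels handled by the kernel
  int skip = (MASK + 1 - r) * BPP;  // 12 padded results to discard
  assert(n == 16 && r == 4 && skip == 12);
}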
-
-#ifdef HAS_MIRRORROW_AVX2
-ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
-#endif
-#ifdef HAS_MIRRORROW_SSSE3
-ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
-#endif
-#ifdef HAS_MIRRORROW_NEON
-ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
-#endif
-#ifdef HAS_MIRRORROW_MSA
-ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
-#endif
-#ifdef HAS_MIRRORROW_MMI
-ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
-#endif
-#ifdef HAS_ARGBMIRRORROW_AVX2
-ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
-#endif
-#ifdef HAS_ARGBMIRRORROW_SSE2
-ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
-#endif
-#ifdef HAS_ARGBMIRRORROW_NEON
-ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
-#endif
-#ifdef HAS_ARGBMIRRORROW_MSA
-ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
-#endif
-#ifdef HAS_ARGBMIRRORROW_MMI
-ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1)
-#endif
-#undef ANY11M
-
-// Any 1 plane. (memset)
-#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
- void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
- SIMD_ALIGNED(uint8_t temp[64]); \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(dst_ptr, v32, n); \
- } \
- ANY_SIMD(temp, v32, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp, r * BPP); \
- }
-
-#ifdef HAS_SETROW_X86
-ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3)
-#endif
-#ifdef HAS_SETROW_NEON
-ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15)
-#endif
-#ifdef HAS_ARGBSETROW_NEON
-ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
-#endif
-#ifdef HAS_ARGBSETROW_MSA
-ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
-#endif
-#undef ANY1
-
-// Any 1 to 2. Outputs UV planes.
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \
- int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 3]); \
- memset(temp, 0, 128); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_u, dst_v, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
- ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
- memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
- memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
- }
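
// A scalar sketch of a typical ANY12 kernel: YUY2 stores [Y0 U Y1 V] per
// 2 pixels (BPP = 4 with UVSHIFT = 1) and the UV422 output emits one U and
// one V per 2 pixels (DUVSHIFT = 1). Hypothetical helper for illustration.
#include <stdint.h>
static void YUY2ToUV422Row_Sketch(const uint8_t* src_yuy2, uint8_t* dst_u,
                                  uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    dst_u[x >> 1] = src_yuy2[x * 2 + 1];  // U shared by pixels x and x + 1
    dst_v[x >> 1] = src_yuy2[x * 2 + 3];  // V shared by pixels x and x + 1
  }
}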
-
-#ifdef HAS_SPLITUVROW_SSE2
-ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
-#endif
-#ifdef HAS_SPLITUVROW_AVX2
-ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
-#endif
-#ifdef HAS_SPLITUVROW_NEON
-ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
-#endif
-#ifdef HAS_SPLITUVROW_MSA
-ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
-#endif
-#ifdef HAS_SPLITUVROW_MMI
-ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7)
-#endif
-#ifdef HAS_ARGBTOUV444ROW_SSSE3
-ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_AVX2
-ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
-ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_SSE2
-ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
-ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_NEON
-ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
-ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
-ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_MSA
-ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
-ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
-ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_MMI
-ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7)
-ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15)
-ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15)
-#endif
-#undef ANY12
-
-// Any 1 to 3. Outputs RGB planes.
-#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \
- uint8_t* dst_b, int width) { \
- SIMD_ALIGNED(uint8_t temp[16 * 6]); \
- memset(temp, 0, 16 * 3); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \
- } \
- memcpy(temp, src_ptr + n * BPP, r * BPP); \
- ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \
- memcpy(dst_r + n, temp + 16 * 3, r); \
- memcpy(dst_g + n, temp + 16 * 4, r); \
- memcpy(dst_b + n, temp + 16 * 5, r); \
- }
-
-#ifdef HAS_SPLITRGBROW_SSSE3
-ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
-#endif
-#ifdef HAS_SPLITRGBROW_NEON
-ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
-#endif
-#ifdef HAS_SPLITRGBROW_MMI
-ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
-#endif
-#undef ANY13
-
-// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
-// A 128-byte row allows for 32 AVX ARGB pixels.
-#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \
- uint8_t* dst_v, int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 4]); \
- memset(temp, 0, 128 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
- memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
- SS(r, UVSHIFT) * BPP); \
- if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
- memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
- BPP); \
- memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
- temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
- } \
- ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
- memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
- memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \
- }
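
// A minimal sketch of the odd-width guard above: 2x2-subsampled UV kernels
// average horizontal pixel pairs, so an odd tail duplicates its last staged
// pixel to give the averaging a well-defined partner. r > 0 whenever width
// is odd, so the copy below never reads before the buffer.
#include <stdint.h>
#include <string.h>
static void repeat_last_pixel_example(uint8_t* temp, int r, int bpp) {
  memcpy(temp + r * bpp, temp + r * bpp - bpp, bpp);  // clone final pixel
}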
-
-#ifdef HAS_ARGBTOUVROW_AVX2
-ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
-#endif
-#ifdef HAS_ARGBTOUVJROW_AVX2
-ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
-#endif
-#ifdef HAS_ARGBTOUVROW_SSSE3
-ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
-ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
-ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
-ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
-ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
-#endif
-#ifdef HAS_YUY2TOUVROW_AVX2
-ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
-ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
-#endif
-#ifdef HAS_YUY2TOUVROW_SSE2
-ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
-ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
-#endif
-#ifdef HAS_ARGBTOUVROW_NEON
-ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_ARGBTOUVROW_MSA
-ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
-#endif
-#ifdef HAS_ARGBTOUVROW_MMI
-ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15)
-#endif
-#ifdef HAS_ARGBTOUVJROW_NEON
-ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_ARGBTOUVJROW_MSA
-ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
-#endif
-#ifdef HAS_ARGBTOUVJROW_MMI
-ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15)
-#endif
-#ifdef HAS_BGRATOUVROW_NEON
-ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_BGRATOUVROW_MSA
-ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
-#endif
-#ifdef HAS_BGRATOUVROW_MMI
-ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15)
-#endif
-#ifdef HAS_ABGRTOUVROW_NEON
-ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_ABGRTOUVROW_MSA
-ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
-#endif
-#ifdef HAS_ABGRTOUVROW_MMI
-ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15)
-#endif
-#ifdef HAS_RGBATOUVROW_NEON
-ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
-#endif
-#ifdef HAS_RGBATOUVROW_MSA
-ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
-#endif
-#ifdef HAS_RGBATOUVROW_MMI
-ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15)
-#endif
-#ifdef HAS_RGB24TOUVROW_NEON
-ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
-#endif
-#ifdef HAS_RGB24TOUVROW_MSA
-ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
-#endif
-#ifdef HAS_RGB24TOUVROW_MMI
-ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15)
-#endif
-#ifdef HAS_RAWTOUVROW_NEON
-ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
-#endif
-#ifdef HAS_RAWTOUVROW_MSA
-ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
-#endif
-#ifdef HAS_RAWTOUVROW_MMI
-ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15)
-#endif
-#ifdef HAS_RGB565TOUVROW_NEON
-ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_RGB565TOUVROW_MSA
-ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
-#endif
-#ifdef HAS_RGB565TOUVROW_MMI
-ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB1555TOUVROW_NEON
-ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB1555TOUVROW_MSA
-ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB1555TOUVROW_MMI
-ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB4444TOUVROW_NEON
-ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB4444TOUVROW_MMI
-ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15)
-#endif
-#ifdef HAS_YUY2TOUVROW_NEON
-ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
-#endif
-#ifdef HAS_UYVYTOUVROW_NEON
-ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
-#endif
-#ifdef HAS_YUY2TOUVROW_MSA
-ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
-#endif
-#ifdef HAS_YUY2TOUVROW_MMI
-ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15)
-#endif
-#ifdef HAS_UYVYTOUVROW_MSA
-ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
-#endif
-#ifdef HAS_UYVYTOUVROW_MMI
-ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
-#endif
-#undef ANY12S
-
-// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane.
-// A 128-byte row allows for 32 AVX ARGB pixels.
-#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \
- int width) { \
- SIMD_ALIGNED(uint8_t temp[128 * 3]); \
- memset(temp, 0, 128 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
- memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
- SS(r, UVSHIFT) * BPP); \
- if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
- memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
- BPP); \
- memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
- temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
- } \
- ANY_SIMD(temp, 128, temp + 256, MASK + 1); \
- memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \
- }
-
-#ifdef HAS_AYUVTOVUROW_NEON
-ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
-ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
-#endif
-#undef ANY11S
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_dspr2.cc b/files/source/row_dspr2.cc
deleted file mode 100644
index 11f78e0d..00000000
--- a/files/source/row_dspr2.cc
+++ /dev/null
@@ -1,1721 +0,0 @@
-/*
- * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// The following are available on MIPS platforms:
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-#ifdef HAS_COPYROW_MIPS
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
- __asm__ __volatile__(
- ".set noreorder \n"
- ".set noat \n"
- "slti $at, %[count], 8 \n"
- "bne $at ,$zero, $last8 \n"
- "xor $t8, %[src], %[dst] \n"
- "andi $t8, $t8, 0x3 \n"
-
- "bne $t8, $zero, unaligned \n"
- "negu $a3, %[dst] \n"
- // make dst/src aligned
- "andi $a3, $a3, 0x3 \n"
- "beq $a3, $zero, $chk16w \n"
- // word-aligned; now count is the remaining byte count
- "subu %[count], %[count], $a3 \n"
-
- "lwr $t8, 0(%[src]) \n"
- "addu %[src], %[src], $a3 \n"
- "swr $t8, 0(%[dst]) \n"
- "addu %[dst], %[dst], $a3 \n"
-
- // Now the dst/src are mutually word-aligned with word-aligned addresses
- "$chk16w: \n"
- "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
- // t8 is the byte count after 64-byte chunks
- "beq %[count], $t8, chk8w \n"
- // There will be at most 1 32-byte chunk after it
- "subu $a3, %[count], $t8 \n" // the reminder
- // Here a3 counts bytes in 16w chunks
- "addu $a3, %[dst], $a3 \n"
- // Now a3 is the final dst after 64-byte chunks
- "addu $t0, %[dst], %[count] \n"
- // t0 is the "past the end" address
-
- // When the loop issues "pref 30, x(a1)", a1+x must not go past the
- // "t0-32" address. For x=128 the last "safe" a1 address is "t0-160";
- // for x=64 it is "t0-96". We use "pref 30, 128(a1)", so "t0-160" is
- // the limit.
- "subu $t9, $t0, 160 \n"
- // t9 is the "last safe pref 30,128(a1)" address
- "pref 0, 0(%[src]) \n" // first line of src
- "pref 0, 32(%[src]) \n" // second line of src
- "pref 0, 64(%[src]) \n"
- "pref 30, 32(%[dst]) \n"
- // In case the a1 > t9 don't use "pref 30" at all
- "sltu $v1, $t9, %[dst] \n"
- "bgtz $v1, $loop16w \n"
- "nop \n"
- // otherwise, start with using pref30
- "pref 30, 64(%[dst]) \n"
- "$loop16w: \n"
- "pref 0, 96(%[src]) \n"
- "lw $t0, 0(%[src]) \n"
- "bgtz $v1, $skip_pref30_96 \n" // skip
- "lw $t1, 4(%[src]) \n"
- "pref 30, 96(%[dst]) \n" // continue
- "$skip_pref30_96: \n"
- "lw $t2, 8(%[src]) \n"
- "lw $t3, 12(%[src]) \n"
- "lw $t4, 16(%[src]) \n"
- "lw $t5, 20(%[src]) \n"
- "lw $t6, 24(%[src]) \n"
- "lw $t7, 28(%[src]) \n"
- "pref 0, 128(%[src]) \n"
- // bring the next lines of src, addr 128
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "lw $t0, 32(%[src]) \n"
- "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
- "lw $t1, 36(%[src]) \n"
- "pref 30, 128(%[dst]) \n" // set dest, addr 128
- "$skip_pref30_128: \n"
- "lw $t2, 40(%[src]) \n"
- "lw $t3, 44(%[src]) \n"
- "lw $t4, 48(%[src]) \n"
- "lw $t5, 52(%[src]) \n"
- "lw $t6, 56(%[src]) \n"
- "lw $t7, 60(%[src]) \n"
- "pref 0, 160(%[src]) \n"
- // bring the next lines of src, addr 160
- "sw $t0, 32(%[dst]) \n"
- "sw $t1, 36(%[dst]) \n"
- "sw $t2, 40(%[dst]) \n"
- "sw $t3, 44(%[dst]) \n"
- "sw $t4, 48(%[dst]) \n"
- "sw $t5, 52(%[dst]) \n"
- "sw $t6, 56(%[dst]) \n"
- "sw $t7, 60(%[dst]) \n"
-
- "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
- "sltu $v1, $t9, %[dst] \n"
- "bne %[dst], $a3, $loop16w \n"
- " addiu %[src], %[src], 64 \n" // adding 64 to src
- "move %[count], $t8 \n"
-
- // Here we have src and dest word-aligned but less than 64-bytes to go
-
- "chk8w: \n"
- "pref 0, 0x0(%[src]) \n"
- "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
- // t8 is the remainder count past 32 bytes
- "beq %[count], $t8, chk1w \n"
- // count==t8, no 32-byte chunk
- " nop \n"
-
- "lw $t0, 0(%[src]) \n"
- "lw $t1, 4(%[src]) \n"
- "lw $t2, 8(%[src]) \n"
- "lw $t3, 12(%[src]) \n"
- "lw $t4, 16(%[src]) \n"
- "lw $t5, 20(%[src]) \n"
- "lw $t6, 24(%[src]) \n"
- "lw $t7, 28(%[src]) \n"
- "addiu %[src], %[src], 32 \n"
-
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "addiu %[dst], %[dst], 32 \n"
-
- "chk1w: \n"
- "andi %[count], $t8, 0x3 \n"
- // now count is the remainder past 1w chunks
- "beq %[count], $t8, $last8 \n"
- " subu $a3, $t8, %[count] \n"
- // a3 is count of bytes in 1w chunks
- "addu $a3, %[dst], $a3 \n"
- // now a3 is the dst address past the 1w chunks
- // copying in words (4-byte chunks)
- "$wordCopy_loop: \n"
- "lw $t3, 0(%[src]) \n"
- // the first t3 may equal t0 ... optimize?
- "addiu %[src], %[src],4 \n"
- "addiu %[dst], %[dst],4 \n"
- "bne %[dst], $a3,$wordCopy_loop \n"
- " sw $t3, -4(%[dst]) \n"
-
- // For the last (<8) bytes
- "$last8: \n"
- "blez %[count], leave \n"
- " addu $a3, %[dst], %[count] \n" // a3 -last dst address
- "$last8loop: \n"
- "lb $v1, 0(%[src]) \n"
- "addiu %[src], %[src], 1 \n"
- "addiu %[dst], %[dst], 1 \n"
- "bne %[dst], $a3, $last8loop \n"
- " sb $v1, -1(%[dst]) \n"
-
- "leave: \n"
- " j $ra \n"
- " nop \n"
-
- //
- // UNALIGNED case
- //
-
- "unaligned: \n"
- // got here with a3="negu a1"
- "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
- "beqz $a3, $ua_chk16w \n"
- " subu %[count], %[count], $a3 \n"
- // bytes left after initial a3 bytes
- "lwr $v1, 0(%[src]) \n"
- "lwl $v1, 3(%[src]) \n"
- "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
- "swr $v1, 0(%[dst]) \n"
- "addu %[dst], %[dst], $a3 \n"
- // below the dst will be word aligned (NOTE1)
- "$ua_chk16w: \n"
- "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
- // t8 is the byte count after 64-byte chunks
- "beq %[count], $t8, ua_chk8w \n"
- // if a2==t8, no 64-byte chunks
- // There will be at most 1 32-byte chunk after it
- "subu $a3, %[count], $t8 \n" // the reminder
- // Here a3 counts bytes in 16w chunks
- "addu $a3, %[dst], $a3 \n"
- // Now a3 is the final dst after 64-byte chunks
- "addu $t0, %[dst], %[count] \n" // t0 "past the end"
- "subu $t9, $t0, 160 \n"
- // t9 is the "last safe pref 30,128(a1)" address
- "pref 0, 0(%[src]) \n" // first line of src
- "pref 0, 32(%[src]) \n" // second line addr 32
- "pref 0, 64(%[src]) \n"
- "pref 30, 32(%[dst]) \n"
- // safe, as we have at least 64 bytes ahead
- // In case the a1 > t9 don't use "pref 30" at all
- "sltu $v1, $t9, %[dst] \n"
- "bgtz $v1, $ua_loop16w \n"
- // skip "pref 30,64(a1)" for too short arrays
- " nop \n"
- // otherwise, start with using pref30
- "pref 30, 64(%[dst]) \n"
- "$ua_loop16w: \n"
- "pref 0, 96(%[src]) \n"
- "lwr $t0, 0(%[src]) \n"
- "lwl $t0, 3(%[src]) \n"
- "lwr $t1, 4(%[src]) \n"
- "bgtz $v1, $ua_skip_pref30_96 \n"
- " lwl $t1, 7(%[src]) \n"
- "pref 30, 96(%[dst]) \n"
- // continue setting up the dest, addr 96
- "$ua_skip_pref30_96: \n"
- "lwr $t2, 8(%[src]) \n"
- "lwl $t2, 11(%[src]) \n"
- "lwr $t3, 12(%[src]) \n"
- "lwl $t3, 15(%[src]) \n"
- "lwr $t4, 16(%[src]) \n"
- "lwl $t4, 19(%[src]) \n"
- "lwr $t5, 20(%[src]) \n"
- "lwl $t5, 23(%[src]) \n"
- "lwr $t6, 24(%[src]) \n"
- "lwl $t6, 27(%[src]) \n"
- "lwr $t7, 28(%[src]) \n"
- "lwl $t7, 31(%[src]) \n"
- "pref 0, 128(%[src]) \n"
- // bring the next lines of src, addr 128
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "lwr $t0, 32(%[src]) \n"
- "lwl $t0, 35(%[src]) \n"
- "lwr $t1, 36(%[src]) \n"
- "bgtz $v1, ua_skip_pref30_128 \n"
- " lwl $t1, 39(%[src]) \n"
- "pref 30, 128(%[dst]) \n"
- // continue setting up the dest, addr 128
- "ua_skip_pref30_128: \n"
-
- "lwr $t2, 40(%[src]) \n"
- "lwl $t2, 43(%[src]) \n"
- "lwr $t3, 44(%[src]) \n"
- "lwl $t3, 47(%[src]) \n"
- "lwr $t4, 48(%[src]) \n"
- "lwl $t4, 51(%[src]) \n"
- "lwr $t5, 52(%[src]) \n"
- "lwl $t5, 55(%[src]) \n"
- "lwr $t6, 56(%[src]) \n"
- "lwl $t6, 59(%[src]) \n"
- "lwr $t7, 60(%[src]) \n"
- "lwl $t7, 63(%[src]) \n"
- "pref 0, 160(%[src]) \n"
- // bring the next lines of src, addr 160
- "sw $t0, 32(%[dst]) \n"
- "sw $t1, 36(%[dst]) \n"
- "sw $t2, 40(%[dst]) \n"
- "sw $t3, 44(%[dst]) \n"
- "sw $t4, 48(%[dst]) \n"
- "sw $t5, 52(%[dst]) \n"
- "sw $t6, 56(%[dst]) \n"
- "sw $t7, 60(%[dst]) \n"
-
- "addiu %[dst],%[dst],64 \n" // adding 64 to dest
- "sltu $v1,$t9,%[dst] \n"
- "bne %[dst],$a3,$ua_loop16w \n"
- " addiu %[src],%[src],64 \n" // adding 64 to src
- "move %[count],$t8 \n"
-
- // Here we have src and dest word-aligned but less than 64-bytes to go
-
- "ua_chk8w: \n"
- "pref 0, 0x0(%[src]) \n"
- "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
- // t8 is the remainder count
- "beq %[count], $t8, $ua_chk1w \n"
- // when count==t8, no 32-byte chunk
-
- "lwr $t0, 0(%[src]) \n"
- "lwl $t0, 3(%[src]) \n"
- "lwr $t1, 4(%[src]) \n"
- "lwl $t1, 7(%[src]) \n"
- "lwr $t2, 8(%[src]) \n"
- "lwl $t2, 11(%[src]) \n"
- "lwr $t3, 12(%[src]) \n"
- "lwl $t3, 15(%[src]) \n"
- "lwr $t4, 16(%[src]) \n"
- "lwl $t4, 19(%[src]) \n"
- "lwr $t5, 20(%[src]) \n"
- "lwl $t5, 23(%[src]) \n"
- "lwr $t6, 24(%[src]) \n"
- "lwl $t6, 27(%[src]) \n"
- "lwr $t7, 28(%[src]) \n"
- "lwl $t7, 31(%[src]) \n"
- "addiu %[src], %[src], 32 \n"
-
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "addiu %[dst], %[dst], 32 \n"
-
- "$ua_chk1w: \n"
- "andi %[count], $t8, 0x3 \n"
- // now count is the remainder past 1w chunks
- "beq %[count], $t8, ua_smallCopy \n"
- "subu $a3, $t8, %[count] \n"
- // a3 is count of bytes in 1w chunks
- "addu $a3, %[dst], $a3 \n"
- // now a3 is the dst address past the 1w chunks
-
- // copying in words (4-byte chunks)
- "$ua_wordCopy_loop: \n"
- "lwr $v1, 0(%[src]) \n"
- "lwl $v1, 3(%[src]) \n"
- "addiu %[src], %[src], 4 \n"
- "addiu %[dst], %[dst], 4 \n"
- // note: dst=a1 is word aligned here, see NOTE1
- "bne %[dst], $a3, $ua_wordCopy_loop \n"
- " sw $v1,-4(%[dst]) \n"
-
- // Now less than 4 bytes (value in count) left to copy
- "ua_smallCopy: \n"
- "beqz %[count], leave \n"
- " addu $a3, %[dst], %[count] \n" // a3 = last dst address
- "$ua_smallCopy_loop: \n"
- "lb $v1, 0(%[src]) \n"
- "addiu %[src], %[src], 1 \n"
- "addiu %[dst], %[dst], 1 \n"
- "bne %[dst],$a3,$ua_smallCopy_loop \n"
- " sb $v1, -1(%[dst]) \n"
-
- "j $ra \n"
- " nop \n"
- ".set at \n"
- ".set reorder \n"
- : [dst] "+r"(dst), [src] "+r"(src)
- : [count] "r"(count)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "a3", "v1",
- "at");
-}
-#endif // HAS_COPYROW_MIPS
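
// A hedged C sketch of the structure the assembly above implements: align
// the head, stream 64-byte chunks (the real code adds "pref" cache hints and
// lwr/lwl loads for unaligned sources), then drain the byte tail. The
// _Sketch name is hypothetical.
#include <stdint.h>
#include <string.h>
static void CopyRow_Sketch(const uint8_t* src, uint8_t* dst, int count) {
  while (count > 0 && ((uintptr_t)dst & 3)) {  // byte head: word-align dst
    *dst++ = *src++;
    --count;
  }
  while (count >= 64) {  // unrolled 64-byte body
    memcpy(dst, src, 64);
    src += 64;
    dst += 64;
    count -= 64;
  }
  while (count-- > 0) {  // byte tail
    *dst++ = *src++;
  }
}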
-
-// DSPR2 functions
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) && \
- (__mips_isa_rev < 6)
-
-void SplitUVRow_DSPR2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "srl $t4, %[width], 4 \n" // multiplies of 16
- "blez $t4, 2f \n"
- " andi %[width], %[width], 0xf \n" // residual
-
- "1: \n"
- "addiu $t4, $t4, -1 \n"
- "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
- "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
- "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
- "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
- "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
- "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 |
- // U10
- "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 |
- // U12
- "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 |
- // U14
- "addiu %[src_uv], %[src_uv], 32 \n"
- "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
- "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
- "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
- "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
- "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
- "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
- "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 |
- // V12
- "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 |
- // U12
- "sw $t9, 0(%[dst_v]) \n"
- "sw $t0, 0(%[dst_u]) \n"
- "sw $t1, 4(%[dst_v]) \n"
- "sw $t2, 4(%[dst_u]) \n"
- "sw $t3, 8(%[dst_v]) \n"
- "sw $t5, 8(%[dst_u]) \n"
- "sw $t6, 12(%[dst_v]) \n"
- "sw $t7, 12(%[dst_u]) \n"
- "addiu %[dst_v], %[dst_v], 16 \n"
- "bgtz $t4, 1b \n"
- " addiu %[dst_u], %[dst_u], 16 \n"
-
- "beqz %[width], 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, 0(%[src_uv]) \n"
- "lbu $t1, 1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], 2 \n"
- "addiu %[width], %[width], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[width], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r"(src_uv), [width] "+r"(width), [dst_u] "+r"(dst_u),
- [dst_v] "+r"(dst_v)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
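
// The scalar operation the precr/precrq pairing above vectorizes, as a
// hypothetical reference:
#include <stdint.h>
static void SplitUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                              uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[x * 2 + 0];  // even bytes form the U plane
    dst_v[x] = src_uv[x * 2 + 1];  // odd bytes form the V plane
  }
}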
-
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t4, %[width], 4 \n" // multiplies of 16
- "andi $t5, %[width], 0xf \n"
- "blez $t4, 2f \n"
- " addu %[src], %[src], %[width] \n" // src += width
-
- "1: \n"
- "lw $t0, -16(%[src]) \n" // |3|2|1|0|
- "lw $t1, -12(%[src]) \n" // |7|6|5|4|
- "lw $t2, -8(%[src]) \n" // |11|10|9|8|
- "lw $t3, -4(%[src]) \n" // |15|14|13|12|
- "wsbh $t0, $t0 \n" // |2|3|0|1|
- "wsbh $t1, $t1 \n" // |6|7|4|5|
- "wsbh $t2, $t2 \n" // |10|11|8|9|
- "wsbh $t3, $t3 \n" // |14|15|12|13|
- "rotr $t0, $t0, 16 \n" // |0|1|2|3|
- "rotr $t1, $t1, 16 \n" // |4|5|6|7|
- "rotr $t2, $t2, 16 \n" // |8|9|10|11|
- "rotr $t3, $t3, 16 \n" // |12|13|14|15|
- "addiu %[src], %[src], -16 \n"
- "addiu $t4, $t4, -1 \n"
- "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
- "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
- "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
- "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
- "bgtz $t4, 1b \n"
- " addiu %[dst], %[dst], 16 \n"
- "beqz $t5, 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, -1(%[src]) \n"
- "addiu $t5, $t5, -1 \n"
- "addiu %[src], %[src], -1 \n"
- "sb $t0, 0(%[dst]) \n"
- "bgez $t5, 2b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src] "+r"(src), [dst] "+r"(dst)
- : [width] "r"(width)
- : "t0", "t1", "t2", "t3", "t4", "t5");
-}
-
-void MirrorUVRow_DSPR2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- int x;
- int y;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "addu $t4, %[width], %[width] \n"
- "srl %[x], %[width], 4 \n"
- "andi %[y], %[width], 0xf \n"
- "blez %[x], 2f \n"
- " addu %[src_uv], %[src_uv], $t4 \n"
-
- "1: \n"
- "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
- "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
- "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
- "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
- "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
- "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
- "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
- "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
-
- "rotr $t0, $t0, 16 \n" // |1|0|3|2|
- "rotr $t1, $t1, 16 \n" // |5|4|7|6|
- "rotr $t2, $t2, 16 \n" // |9|8|11|10|
- "rotr $t3, $t3, 16 \n" // |13|12|15|14|
- "rotr $t4, $t4, 16 \n" // |17|16|19|18|
- "rotr $t6, $t6, 16 \n" // |21|20|23|22|
- "rotr $t7, $t7, 16 \n" // |25|24|27|26|
- "rotr $t8, $t8, 16 \n" // |29|28|31|30|
- "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
- "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
- "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
- "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
- "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
- "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
- "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
- "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
- "addiu %[src_uv], %[src_uv], -32 \n"
- "addiu %[x], %[x], -1 \n"
- "swr $t4, 0(%[dst_u]) \n"
- "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
- "swr $t6, 0(%[dst_v]) \n"
- "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
- "swr $t2, 4(%[dst_u]) \n"
- "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
- "swr $t3, 4(%[dst_v]) \n"
- "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
- "swr $t0, 8(%[dst_u]) \n"
- "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
- "swr $t1, 8(%[dst_v]) \n"
- "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
- "swr $t9, 12(%[dst_u]) \n"
- "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
- "swr $t5, 12(%[dst_v]) \n"
- "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
- "addiu %[dst_v], %[dst_v], 16 \n"
- "bgtz %[x], 1b \n"
- " addiu %[dst_u], %[dst_u], 16 \n"
- "beqz %[y], 3f \n"
- " nop \n"
- "b 2f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, -2(%[src_uv]) \n"
- "lbu $t1, -1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], -2 \n"
- "addiu %[y], %[y], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[y], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r"(src_uv), [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v),
- [x] "=&r"(x), [y] "=&r"(y)
- : [width] "r"(width)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9");
-}
-
-void I422ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- uint32 tmp_yg;
- uint32 tmp_mask = 0x7fff7fff;
- tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
- tmp_ub =
-     ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr =
-     ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[src_y]) \n"
- "lbu %[tmp_t1], 1(%[src_y]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lbu %[tmp_t2], 0(%[src_u]) \n"
- "lbu %[tmp_t3], 0(%[src_v]) \n"
- "replv.ph %[tmp_t2], %[tmp_t2] \n"
- "replv.ph %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t9], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t9], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "sw %[tmp_t8], 0(%[rgb_buf]) \n"
- "sw %[tmp_t7], 4(%[rgb_buf]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
- [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [yg] "r"(yg),
- [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
- [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
- [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
- src_y += 2;
- src_u += 1;
- src_v += 1;
- rgb_buf += 8; // Advance 4 pixels.
- }
-}
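
// A hedged scalar sketch of the fixed-point pixel math vectorized above,
// following the shape of this library's C reference: Y is replicated and
// scaled with 16 fractional bits, U/V products are applied against biases
// that already fold in the UV offset, and a >> 6 with saturation yields
// 8-bit channels. All _Sketch names are hypothetical.
#include <stdint.h>
static uint8_t Clamp8_Sketch(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvPixel_Sketch(int y, int u, int v, int yg, int ub, int ug,
                            int vg, int vr, int bb, int bg, int br,
                            uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (y * 0x0101 * yg) >> 16;  // replicate Y to 16 bits, apply gain
  *b = Clamp8_Sketch((y1 - u * ub + bb) >> 6);
  *g = Clamp8_Sketch((y1 - u * ug - v * vg + bg) >> 6);
  *r = Clamp8_Sketch((y1 - v * vr + br) >> 6);
}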
-
-// Bilinear filter 8x2 -> 8x1
-void InterpolateRow_DSPR2(uint8* dst_ptr,
- const uint8* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- int y0_fraction = 256 - source_y_fraction;
- const uint8* src_ptr1 = src_ptr + src_stride;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "replv.ph $t0, %[y0_fraction] \n"
- "replv.ph $t1, %[source_y_fraction] \n"
-
- "1: \n"
- "lw $t2, 0(%[src_ptr]) \n"
- "lw $t3, 0(%[src_ptr1]) \n"
- "lw $t4, 4(%[src_ptr]) \n"
- "lw $t5, 4(%[src_ptr1]) \n"
- "muleu_s.ph.qbl $t6, $t2, $t0 \n"
- "muleu_s.ph.qbr $t7, $t2, $t0 \n"
- "muleu_s.ph.qbl $t8, $t3, $t1 \n"
- "muleu_s.ph.qbr $t9, $t3, $t1 \n"
- "muleu_s.ph.qbl $t2, $t4, $t0 \n"
- "muleu_s.ph.qbr $t3, $t4, $t0 \n"
- "muleu_s.ph.qbl $t4, $t5, $t1 \n"
- "muleu_s.ph.qbr $t5, $t5, $t1 \n"
- "addq.ph $t6, $t6, $t8 \n"
- "addq.ph $t7, $t7, $t9 \n"
- "addq.ph $t2, $t2, $t4 \n"
- "addq.ph $t3, $t3, $t5 \n"
- "shra_r.ph $t6, $t6, 8 \n"
- "shra_r.ph $t7, $t7, 8 \n"
- "shra_r.ph $t2, $t2, 8 \n"
- "shra_r.ph $t3, $t3, 8 \n"
- "precr.qb.ph $t6, $t6, $t7 \n"
- "precr.qb.ph $t2, $t2, $t3 \n"
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[src_ptr1], %[src_ptr1], 8 \n"
- "addiu %[dst_width], %[dst_width], -8 \n"
- "sw $t6, 0(%[dst_ptr]) \n"
- "sw $t2, 4(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[dst_ptr], %[dst_ptr], 8 \n"
-
- ".set pop \n"
- : [dst_ptr] "+r"(dst_ptr), [src_ptr1] "+r"(src_ptr1),
- [src_ptr] "+r"(src_ptr), [dst_width] "+r"(dst_width)
- : [source_y_fraction] "r"(source_y_fraction),
- [y0_fraction] "r"(y0_fraction), [src_stride] "r"(src_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
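-
-// For reference, a plain-C model of the row above (an illustrative sketch;
-// the helper name is hypothetical): each output byte blends the two source
-// rows with 8-bit fixed-point weights, rounding like shra_r.ph.
-static void InterpolateRow_C_Sketch(uint8* dst_ptr,
-                                    const uint8* src_ptr,
-                                    ptrdiff_t src_stride,
-                                    int dst_width,
-                                    int source_y_fraction) {
-  int y0_fraction = 256 - source_y_fraction;
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  int x;
-  for (x = 0; x < dst_width; ++x) {
-    dst_ptr[x] = (uint8)((src_ptr[x] * y0_fraction +
-                          src_ptr1[x] * source_y_fraction + 128) >> 8);
-  }
-}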
-
-void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width) {
- int x;
- uint32 tmp_mask = 0xff;
- uint32 tmp_t1;
- for (x = 0; x < (width - 1); ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "ulw %[tmp_t1], 0(%[src_rgb24]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_rgb24], %[src_rgb24], 3 \n"
- "ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
- "sw %[tmp_t1], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [src_rgb24] "+r"(src_rgb24), [dst_argb] "+r"(dst_argb),
- [tmp_t1] "=&r"(tmp_t1)
- : [tmp_mask] "r"(tmp_mask)
- : "memory");
- }
- uint8 b = src_rgb24[0];
- uint8 g = src_rgb24[1];
- uint8 r = src_rgb24[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = 255u;
-}
-
-void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width) {
- int x;
- uint32 tmp_mask = 0xff;
- uint32 tmp_t1, tmp_t2;
- for (x = 0; x < (width - 1); ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "ulw %[tmp_t1], 0(%[src_raw]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_raw], %[src_raw], 3 \n"
- "srl %[tmp_t2], %[tmp_t1], 16 \n"
- "ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
- "ins %[tmp_t1], %[tmp_t1], 16, 8 \n"
- "ins %[tmp_t1], %[tmp_t2], 0, 8 \n"
- "sw %[tmp_t1], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [src_raw] "+r"(src_raw), [dst_argb] "+r"(dst_argb),
- [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2)
- : [tmp_mask] "r"(tmp_mask)
- : "memory");
- }
- uint8 r = src_raw[0];
- uint8 g = src_raw[1];
- uint8 b = src_raw[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = 255u;
-}
-
-void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565,
- uint8* dst_argb,
- int width) {
- int x;
- uint32 tmp_mask = 0xff;
- uint32 tmp_t1, tmp_t2, tmp_t3;
- for (x = 0; x < width; ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lhu %[tmp_t1], 0(%[src_rgb565]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_rgb565], %[src_rgb565], 2 \n"
- "sll %[tmp_t2], %[tmp_t1], 8 \n"
- "ins %[tmp_t2], %[tmp_mask], 24,8 \n"
- "ins %[tmp_t2], %[tmp_t1], 3, 16 \n"
- "ins %[tmp_t2], %[tmp_t1], 5, 11 \n"
- "srl %[tmp_t3], %[tmp_t1], 9 \n"
- "ins %[tmp_t2], %[tmp_t3], 8, 2 \n"
- "ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
- "srl %[tmp_t3], %[tmp_t1], 2 \n"
- "ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
- "sw %[tmp_t2], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [src_rgb565] "+r"(src_rgb565),
- [dst_argb] "+r"(dst_argb)
- : [tmp_mask] "r"(tmp_mask));
- }
-}
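-
-// The shift/ins sequence above is the standard 565 expansion: replicate each
-// field's top bits into its low bits so full-scale 5- and 6-bit values map
-// to 0xff. Per-pixel scalar form (illustrative sketch, hypothetical name):
-static __inline uint32 RGB565ToARGBPixel_Sketch(uint32 p) {
-  uint32 b = p & 0x1f;
-  uint32 g = (p >> 5) & 0x3f;
-  uint32 r = (p >> 11) & 0x1f;
-  b = (b << 3) | (b >> 2);
-  g = (g << 2) | (g >> 4);
-  r = (r << 3) | (r >> 2);
-  return 0xff000000u | (r << 16) | (g << 8) | b;
-}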
-
-void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
- uint8* dst_argb,
- int width) {
- int x;
- uint32 tmp_t1, tmp_t2, tmp_t3;
- for (x = 0; x < width; ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lh %[tmp_t1], 0(%[src_argb1555]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_argb1555], %[src_argb1555], 2 \n"
- "sll %[tmp_t2], %[tmp_t1], 9 \n"
- "ins %[tmp_t2], %[tmp_t1], 4, 15 \n"
- "ins %[tmp_t2], %[tmp_t1], 6, 10 \n"
- "srl %[tmp_t3], %[tmp_t1], 7 \n"
- "ins %[tmp_t2], %[tmp_t3], 8, 3 \n"
- "ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
- "srl %[tmp_t3], %[tmp_t1], 2 \n"
- "ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
- "sw %[tmp_t2], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [src_argb1555] "+r"(src_argb1555),
- [dst_argb] "+r"(dst_argb)
- :);
- }
-}
-
-void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
- uint8* dst_argb,
- int width) {
- int x;
- uint32 tmp_t1;
- for (x = 0; x < width; ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lh %[tmp_t1], 0(%[src_argb4444]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_argb4444], %[src_argb4444], 2 \n"
- "ins %[tmp_t1], %[tmp_t1], 16, 16 \n"
- "ins %[tmp_t1], %[tmp_t1], 12, 16 \n"
- "ins %[tmp_t1], %[tmp_t1], 8, 12 \n"
- "ins %[tmp_t1], %[tmp_t1], 4, 8 \n"
- "sw %[tmp_t1], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [src_argb4444] "+r"(src_argb4444), [dst_argb] "+r"(dst_argb),
- [tmp_t1] "=&r"(tmp_t1));
- }
-}
-
-void I444ToARGBRow_DSPR2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- uint32 tmp_mask = 0x7fff7fff;
- uint32 tmp_yg;
-
- tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
- tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[y_buf]) \n"
- "lbu %[tmp_t1], 1(%[y_buf]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lh %[tmp_t2], 0(%[u_buf]) \n"
- "lh %[tmp_t3], 0(%[v_buf]) \n"
- "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
- "sw %[tmp_t8], 0(%[rgb_buf]) \n"
- "sw %[tmp_t7], 4(%[rgb_buf]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [y_buf] "r"(y_buf), [yg] "r"(yg), [u_buf] "r"(u_buf),
- [v_buf] "r"(v_buf), [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug),
- [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
- [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
- [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
- y_buf += 2;
- u_buf += 2;
- v_buf += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
-}
-
-void I422ToARGB4444Row_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- uint32 tmp_yg;
- uint32 tmp_mask = 0x7fff7fff;
- tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
- tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[src_y]) \n"
- "lbu %[tmp_t1], 1(%[src_y]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lbu %[tmp_t2], 0(%[src_u]) \n"
- "lbu %[tmp_t3], 0(%[src_v]) \n"
- "replv.ph %[tmp_t2], %[tmp_t2] \n"
- "replv.ph %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
- "shrl.qb %[tmp_t1], %[tmp_t8], 4 \n"
- "shrl.qb %[tmp_t2], %[tmp_t7], 4 \n"
- "shrl.ph %[tmp_t8], %[tmp_t1], 4 \n"
- "shrl.ph %[tmp_t7], %[tmp_t2], 4 \n"
- "or %[tmp_t8], %[tmp_t8], %[tmp_t1] \n"
- "or %[tmp_t7], %[tmp_t7], %[tmp_t2] \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t7], %[tmp_t8] \n"
- "sw %[tmp_t8], 0(%[dst_argb4444]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [dst_argb4444] "r"(dst_argb4444), [yg] "r"(yg), [src_u] "r"(src_u),
- [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
- [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
- [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
- [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
- src_y += 2;
- src_u += 1;
- src_v += 1;
- dst_argb4444 += 4; // Advance 2 pixels.
- }
-}
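-
-// The shrl/or/precr tail above narrows ARGB8888 to ARGB4444 by keeping the
-// high nibble of every channel; per pixel (illustrative sketch, hypothetical
-// name):
-static __inline uint16 ARGBToARGB4444Pixel_Sketch(const uint8* argb) {
-  uint32 b = argb[0] >> 4;
-  uint32 g = argb[1] >> 4;
-  uint32 r = argb[2] >> 4;
-  uint32 a = argb[3] >> 4;
-  return (uint16)(b | (g << 4) | (r << 8) | (a << 12));
-}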
-
-void I422ToARGB1555Row_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- uint32 tmp_yg;
- uint32 tmp_mask = 0x80008000;
- tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
- tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[src_y]) \n"
- "lbu %[tmp_t1], 1(%[src_y]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lbu %[tmp_t2], 0(%[src_u]) \n"
- "lbu %[tmp_t3], 0(%[src_v]) \n"
- "replv.ph %[tmp_t2], %[tmp_t2] \n"
- "replv.ph %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
- "ins %[tmp_t3], %[tmp_t8], 7, 24 \n"
- "ins %[tmp_t3], %[tmp_t8], 10, 16 \n"
- "ins %[tmp_t3], %[tmp_t8], 13, 8 \n"
- "ins %[tmp_t4], %[tmp_t7], 7, 24 \n"
- "ins %[tmp_t4], %[tmp_t7], 10, 16 \n"
- "ins %[tmp_t4], %[tmp_t7], 13, 8 \n"
- "precrq.ph.w %[tmp_t8], %[tmp_t4], %[tmp_t3] \n"
- "or %[tmp_t8], %[tmp_t8], %[tmp_mask]\n"
- "sw %[tmp_t8], 0(%[dst_argb1555]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [dst_argb1555] "r"(dst_argb1555), [yg] "r"(yg), [src_u] "r"(src_u),
- [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
- [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
- [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
- [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
- src_y += 2;
- src_u += 1;
- src_v += 1;
- dst_argb1555 += 4; // Advance 2 pixels.
- }
-}
-
-void NV12ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_uv,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- uint32 tmp_mask = 0x7fff7fff;
- uint32 tmp_yg;
- tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
- tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[src_y]) \n"
- "lbu %[tmp_t1], 1(%[src_y]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lbu %[tmp_t2], 0(%[src_uv]) \n"
- "lbu %[tmp_t3], 1(%[src_uv]) \n"
- "replv.ph %[tmp_t2], %[tmp_t2] \n"
- "replv.ph %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
- "sw %[tmp_t8], 0(%[rgb_buf]) \n"
- "sw %[tmp_t7], 4(%[rgb_buf]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [src_y] "r"(src_y), [src_uv] "r"(src_uv), [yg] "r"(yg),
- [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg),
- [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg),
- [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), [rgb_buf] "r"(rgb_buf),
- [tmp_mask] "r"(tmp_mask));
-
- src_y += 2;
- src_uv += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
-}
-
-void BGRAToUVRow_DSPR2(const uint8* src_rgb0,
- int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
- int x;
- int const1 = 0xffda0000;
- int const2 = 0x0070ffb6;
- int const3 = 0x00700000;
- int const4 = 0xffeeffa2;
- int const5 = 0x100;
- for (x = 0; x < width - 1; x += 2) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_rgb0]) \n"
- "lw %[tmp_t2], 4(%[src_rgb0]) \n"
- "lw %[tmp_t3], 0(%[src_rgb1]) \n"
- "lw %[tmp_t4], 4(%[src_rgb1]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
- "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
- "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
- "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
- "extr_r.w %[tmp_t7], $ac0, 9 \n"
- "extr_r.w %[tmp_t8], $ac1, 9 \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "addiu %[dst_v], %[dst_v], 1 \n"
- "addiu %[src_rgb0], %[src_rgb0], 8 \n"
- "addiu %[src_rgb1], %[src_rgb1], 8 \n"
- "sb %[tmp_t7], -1(%[dst_u]) \n"
- "sb %[tmp_t8], -1(%[dst_v]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
- [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
- : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
- [const4] "r"(const4), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi");
- }
-}
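-
-// const1..const4 above hold the BT.601 chroma weights (112, -74, -38 for U;
-// 112, -94, -18 for V) arranged for BGRA's A,R,G,B byte layout, and the
-// const5*const5 accumulator preload plus extr_r.w rounding supplies the
-// +128 bias. On the 2x2-averaged pixel the math reduces to the scalar form
-// below (illustrative sketch, hypothetical names); the ABGR/ARGB/RGBA
-// variants that follow differ only in how the constants are permuted.
-static __inline uint8 RGBToU_Sketch(int r, int g, int b) {
-  return (uint8)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
-}
-static __inline uint8 RGBToV_Sketch(int r, int g, int b) {
-  return (uint8)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
-}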
-
-void BGRAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
- int x;
- int const1 = 0x00420000;
- int const2 = 0x00190081;
- int const5 = 0x40;
- for (x = 0; x < width; x += 4) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_argb0]) \n"
- "lw %[tmp_t2], 4(%[src_argb0]) \n"
- "lw %[tmp_t3], 8(%[src_argb0]) \n"
- "lw %[tmp_t4], 12(%[src_argb0]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "mult $ac2, %[const5], %[const5] \n"
- "mult $ac3, %[const5], %[const5] \n"
- "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
- "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
- "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
- "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
- "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
- "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
- "extr_r.w %[tmp_t1], $ac0, 8 \n"
- "extr_r.w %[tmp_t2], $ac1, 8 \n"
- "extr_r.w %[tmp_t3], $ac2, 8 \n"
- "extr_r.w %[tmp_t4], $ac3, 8 \n"
- "addiu %[src_argb0],%[src_argb0], 16 \n"
- "addiu %[dst_y], %[dst_y], 4 \n"
- "sb %[tmp_t1], -4(%[dst_y]) \n"
- "sb %[tmp_t2], -3(%[dst_y]) \n"
- "sb %[tmp_t3], -2(%[dst_y]) \n"
- "sb %[tmp_t4], -1(%[dst_y]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
- : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
- "$ac3hi");
- }
-}
-
-void ABGRToUVRow_DSPR2(const uint8* src_rgb0,
- int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
- int x;
- int const1 = 0xffb6ffda;
- int const2 = 0x00000070;
- int const3 = 0xffa20070;
- int const4 = 0x0000ffee;
- int const5 = 0x100;
-
- for (x = 0; x < width - 1; x += 2) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_rgb0]) \n"
- "lw %[tmp_t2], 4(%[src_rgb0]) \n"
- "lw %[tmp_t3], 0(%[src_rgb1]) \n"
- "lw %[tmp_t4], 4(%[src_rgb1]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
- "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
- "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
- "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
- "extr_r.w %[tmp_t7], $ac0, 9 \n"
- "extr_r.w %[tmp_t8], $ac1, 9 \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "addiu %[dst_v], %[dst_v], 1 \n"
- "addiu %[src_rgb0], %[src_rgb0], 8 \n"
- "addiu %[src_rgb1], %[src_rgb1], 8 \n"
- "sb %[tmp_t7], -1(%[dst_u]) \n"
- "sb %[tmp_t8], -1(%[dst_v]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
- [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
- : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
- [const4] "r"(const4), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi");
- }
-}
-
-void ARGBToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
- int x;
- int const1 = 0x00810019;
- int const2 = 0x00000042;
- int const5 = 0x40;
- for (x = 0; x < width; x += 4) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_argb0]) \n"
- "lw %[tmp_t2], 4(%[src_argb0]) \n"
- "lw %[tmp_t3], 8(%[src_argb0]) \n"
- "lw %[tmp_t4], 12(%[src_argb0]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "mult $ac2, %[const5], %[const5] \n"
- "mult $ac3, %[const5], %[const5] \n"
- "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
- "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
- "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
- "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
- "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
- "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
- "extr_r.w %[tmp_t1], $ac0, 8 \n"
- "extr_r.w %[tmp_t2], $ac1, 8 \n"
- "extr_r.w %[tmp_t3], $ac2, 8 \n"
- "extr_r.w %[tmp_t4], $ac3, 8 \n"
- "addiu %[dst_y], %[dst_y], 4 \n"
- "addiu %[src_argb0],%[src_argb0], 16 \n"
- "sb %[tmp_t1], -4(%[dst_y]) \n"
- "sb %[tmp_t2], -3(%[dst_y]) \n"
- "sb %[tmp_t3], -2(%[dst_y]) \n"
- "sb %[tmp_t4], -1(%[dst_y]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
- : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
- "$ac3hi");
- }
-}
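-
-// const1/const2 above pack the BT.601 luma weights (66*R + 129*G + 25*B),
-// and the const5*const5 accumulator preload plus extr_r.w rounding supplies
-// the 0x1080 bias (16.5 in 8.8 fixed point). Scalar form (illustrative
-// sketch, hypothetical name):
-static __inline uint8 RGBToY_Sketch(int r, int g, int b) {
-  return (uint8)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
-}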
-
-void ABGRToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
- int x;
- int const1 = 0x00810042;
- int const2 = 0x00000019;
- int const5 = 0x40;
- for (x = 0; x < width; x += 4) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_argb0]) \n"
- "lw %[tmp_t2], 4(%[src_argb0]) \n"
- "lw %[tmp_t3], 8(%[src_argb0]) \n"
- "lw %[tmp_t4], 12(%[src_argb0]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "mult $ac2, %[const5], %[const5] \n"
- "mult $ac3, %[const5], %[const5] \n"
- "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
- "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
- "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
- "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
- "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
- "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
- "extr_r.w %[tmp_t1], $ac0, 8 \n"
- "extr_r.w %[tmp_t2], $ac1, 8 \n"
- "extr_r.w %[tmp_t3], $ac2, 8 \n"
- "extr_r.w %[tmp_t4], $ac3, 8 \n"
- "addiu %[src_argb0],%[src_argb0], 16 \n"
- "addiu %[dst_y], %[dst_y], 4 \n"
- "sb %[tmp_t1], -4(%[dst_y]) \n"
- "sb %[tmp_t2], -3(%[dst_y]) \n"
- "sb %[tmp_t3], -2(%[dst_y]) \n"
- "sb %[tmp_t4], -1(%[dst_y]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
- : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
- "$ac3hi");
- }
-}
-
-void RGBAToUVRow_DSPR2(const uint8* src_rgb0,
- int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
- int x;
- int const1 = 0xffb60070;
- int const2 = 0x0000ffda;
- int const3 = 0xffa2ffee;
- int const4 = 0x00000070;
- int const5 = 0x100;
-
- for (x = 0; x < width - 1; x += 2) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "ulw %[tmp_t1], 0+1(%[src_rgb0]) \n"
- "ulw %[tmp_t2], 4+1(%[src_rgb0]) \n"
- "ulw %[tmp_t3], 0+1(%[src_rgb1]) \n"
- "ulw %[tmp_t4], 4+1(%[src_rgb1]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
- "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
- "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
- "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
- "extr_r.w %[tmp_t7], $ac0, 9 \n"
- "extr_r.w %[tmp_t8], $ac1, 9 \n"
- "addiu %[src_rgb0], %[src_rgb0], 8 \n"
- "addiu %[src_rgb1], %[src_rgb1], 8 \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "addiu %[dst_v], %[dst_v], 1 \n"
- "sb %[tmp_t7], -1(%[dst_u]) \n"
- "sb %[tmp_t8], -1(%[dst_v]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
- [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
- : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
- [const4] "r"(const4), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi");
- }
-}
-
-void RGBAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
- int x;
- int const1 = 0x00420081;
- int const2 = 0x00190000;
- int const5 = 0x40;
- for (x = 0; x < width; x += 4) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_argb0]) \n"
- "lw %[tmp_t2], 4(%[src_argb0]) \n"
- "lw %[tmp_t3], 8(%[src_argb0]) \n"
- "lw %[tmp_t4], 12(%[src_argb0]) \n"
- "preceu.ph.qbl %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbr %[tmp_t4], %[tmp_t4] \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "mult $ac2, %[const5], %[const5] \n"
- "mult $ac3, %[const5], %[const5] \n"
- "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
- "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
- "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
- "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
- "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
- "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
- "extr_r.w %[tmp_t1], $ac0, 8 \n"
- "extr_r.w %[tmp_t2], $ac1, 8 \n"
- "extr_r.w %[tmp_t3], $ac2, 8 \n"
- "extr_r.w %[tmp_t4], $ac3, 8 \n"
- "addiu %[dst_y], %[dst_y], 4 \n"
- "addiu %[src_argb0],%[src_argb0], 16 \n"
- "sb %[tmp_t1], -4(%[dst_y]) \n"
- "sb %[tmp_t2], -3(%[dst_y]) \n"
- "sb %[tmp_t3], -2(%[dst_y]) \n"
- "sb %[tmp_t4], -1(%[dst_y]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
- : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
- "$ac3hi");
- }
-}
-
-void ARGBToUVRow_DSPR2(const uint8* src_rgb0,
- int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
- int x;
- int const1 = 0xffb60070;
- int const2 = 0x0000ffda;
- int const3 = 0xffa2ffee;
- int const4 = 0x00000070;
- int const5 = 0x100;
-
- for (x = 0; x < width - 1; x += 2) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_rgb0]) \n"
- "lw %[tmp_t2], 4(%[src_rgb0]) \n"
- "lw %[tmp_t3], 0(%[src_rgb1]) \n"
- "lw %[tmp_t4], 4(%[src_rgb1]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
- "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
- "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
- "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
- "extr_r.w %[tmp_t7], $ac0, 9 \n"
- "extr_r.w %[tmp_t8], $ac1, 9 \n"
- "addiu %[src_rgb0], %[src_rgb0], 8 \n"
- "addiu %[src_rgb1], %[src_rgb1], 8 \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "addiu %[dst_v], %[dst_v], 1 \n"
- "sb %[tmp_t7], -1(%[dst_u]) \n"
- "sb %[tmp_t8], -1(%[dst_v]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
- [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
- : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
- [const4] "r"(const4), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi");
- }
-}
-
-#endif // __mips_dsp_rev >= 2
-
-#endif // defined(__mips__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc
deleted file mode 100644
index decd3d2e..00000000
--- a/files/source/row_gcc.cc
+++ /dev/null
@@ -1,6798 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
-
-// Constants for ARGB
-static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
- 13, 65, 33, 0, 13, 65, 33, 0};
-
-// JPEG full range.
-static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
- 15, 75, 38, 0, 15, 75, 38, 0};
-#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
-
-#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
-
-static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
- 112, -74, -38, 0, 112, -74, -38, 0};
-
-static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
- 127, -84, -43, 0, 127, -84, -43, 0};
-
-static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
- -18, -94, 112, 0, -18, -94, 112, 0};
-
-static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
- -20, -107, 127, 0, -20, -107, 127, 0};
-
-// Constants for BGRA
-static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
- 0, 33, 65, 13, 0, 33, 65, 13};
-
-static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
- 0, -38, -74, 112, 0, -38, -74, 112};
-
-static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
- 0, 112, -94, -18, 0, 112, -94, -18};
-
-// Constants for ABGR
-static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
- 33, 65, 13, 0, 33, 65, 13, 0};
-
-static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
- -38, -74, 112, 0, -38, -74, 112, 0};
-
-static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
- 112, -94, -18, 0, 112, -94, -18, 0};
-
-// Constants for RGBA.
-static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
- 0, 13, 65, 33, 0, 13, 65, 33};
-
-static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
- 0, 112, -74, -38, 0, 112, -74, -38};
-
-static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
- 0, -18, -94, 112, 0, -18, -94, 112};
-
-static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
-
-// 7-bit fixed-point 0.5 (64/128).
-static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
-
-static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
-#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
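-
-// The weights above are the 8-bit BT.601 coefficients halved to 7 bits so
-// they fit pmaddubsw; e.g. the SSSE3 luma path computes approximately
-// Y = ((13*B + 65*G + 33*R) >> 7) + 16 (a sketch of the math only, not the
-// exact instruction sequence):
-static __inline uint8_t ARGBToY7_Sketch(uint8_t b, uint8_t g, uint8_t r) {
-  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
-}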
-
-#ifdef HAS_RGB24TOARGBROW_SSSE3
-
-// Shuffle table for converting RGB24 to ARGB.
-static const uvec8 kShuffleMaskRGB24ToARGB = {
- 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
-
-// Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
- 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
-
-// Shuffle table for converting RAW to RGB24. First 8.
-static const uvec8 kShuffleMaskRAWToRGB24_0 = {
- 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting RAW to RGB24. Middle 8.
-static const uvec8 kShuffleMaskRAWToRGB24_1 = {
- 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting RAW to RGB24. Last 8.
-static const uvec8 kShuffleMaskRAWToRGB24_2 = {
- 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting ARGB to RGB24.
-static const uvec8 kShuffleMaskARGBToRGB24 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting ARGB to RAW.
-static const uvec8 kShuffleMaskARGBToRAW = {
- 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting ARGB to RGB24, for I422ToRGB24. First 8 + next 4.
-static const uvec8 kShuffleMaskARGBToRGB24_0 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
-
-// YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
- 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
- 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
-
-// YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
- 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
- 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
-
-// UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
- 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
- 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
-
-// UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
- 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
- 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
-
-// NV21 shuf 8 VU to 16 UV.
-static const lvec8 kShuffleNV21 = {
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-};
-#endif // HAS_RGB24TOARGBROW_SSSE3
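-
-// Each table above is a pshufb control mask: output byte i becomes
-// src[mask[i] & 15], or zero when mask[i] has its top bit set (the 128u
-// entries). A scalar model of the instruction (illustrative):
-static __inline void Pshufb_Sketch(const uint8_t* src,
-                                   const uint8_t* mask,
-                                   uint8_t* dst) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    dst[i] = (uint8_t)((mask[i] & 0x80) ? 0 : src[mask[i] & 15]);
-  }
-}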
-
-#ifdef HAS_J400TOARGBROW_SSE2
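-// Expand gray J400 to ARGB: punpcklbw/punpcklwd replicate each Y byte into
-// B, G and R, and por merges the 0xff000000 alpha mask built in xmm5.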
-void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm5");
-}
-#endif // HAS_J400TOARGBROW_SSE2
-
-#ifdef HAS_RGB24TOARGBROW_SSSE3
-void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskRGB24ToARGB) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskRAWToARGB) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
- uint8_t* dst_rgb24,
- int width) {
- asm volatile(
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
- "movdqa %5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x4(%0),%%xmm1 \n"
- "movdqu 0x8(%0),%%xmm2 \n"
- "lea 0x18(%0),%0 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskRAWToRGB24_0), // %3
- "m"(kShuffleMaskRAWToRGB24_1), // %4
- "m"(kShuffleMaskRAWToRGB24_2) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x20802080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xa,%%xmm4 \n"
- "psrlw $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
- "xmm6", "xmm7");
-}
-
-void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x42004200,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "movdqa %%xmm3,%%xmm4 \n"
- "psrlw $0x6,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psllw $0x1,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
- "xmm6", "xmm7");
-}
-
-void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "mov $0xf0f0f0f,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x4,%%xmm5 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pand %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "psllw $0x4,%%xmm1 \n"
- "psrlw $0x4,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,0x00(%1,%0,2) \n"
- "movdqu %%xmm1,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
-
- "movdqa %3,%%xmm6 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskARGBToRGB24) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-
-void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
-
- "movdqa %3,%%xmm6 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskARGBToRAW) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-
-#ifdef HAS_ARGBTORGB24ROW_AVX2
-// vpermd for 12+12 to 24
-static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
-
-void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskARGBToRGB24), // %3
- "m"(kPermdRGB24_AVX) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif
-
-#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
-// Shuffle table for converting ARGBToRGB24
-static const ulvec8 kPermARGBToRGB24_0 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
- 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
- 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
-static const ulvec8 kPermARGBToRGB24_1 = {
- 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
- 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
- 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
-static const ulvec8 kPermARGBToRGB24_2 = {
- 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
- 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
- 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
-
-void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vmovdqa %3,%%ymm5 \n"
- "vmovdqa %4,%%ymm6 \n"
- "vmovdqa %5,%%ymm7 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
- "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
- "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kPermARGBToRGB24_0), // %3
- "m"(kPermARGBToRGB24_1), // %4
- "m"(kPermARGBToRGB24_2) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
-}
-#endif
-
-#ifdef HAS_ARGBTORAWROW_AVX2
-void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskARGBToRAW), // %3
- "m"(kPermdRGB24_AVX) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif
-
-void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
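-
-// For reference, a scalar sketch of what each iteration above computes per
-// pixel: the three shifts isolate the top 5, 6 and 5 bits of B, G and R
-// before they are masked and OR'd together. Illustrative helper only, not
-// part of libyuv.
-static uint16_t PackRGB565(uint8_t b, uint8_t g, uint8_t r) {
-  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
-}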
-
-void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
- uint8_t* dst,
- const uint32_t dither4,
- int width) {
- asm volatile(
- "movd %3,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm6 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "punpcklwd %%xmm6,%%xmm6 \n"
- "punpckhwd %%xmm7,%%xmm7 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "paddusb %%xmm6,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(dither4) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
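-
-// The dither variant is the same packing with one extra step: each channel
-// gets a per-column dither byte (expanded from the 4 byte dither4 pattern by
-// the punpck instructions above) added with unsigned saturation before the
-// 8-to-5/6/5 truncation. A scalar sketch of that step (illustrative only):
-static uint8_t AddDitherSat(uint8_t v, uint8_t dither) {
-  int sum = v + dither;
-  return (uint8_t)(sum > 255 ? 255 : sum);  // what paddusb does per byte
-}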
-
-#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
- uint8_t* dst,
- const uint32_t dither4,
- int width) {
- asm volatile(
- "vbroadcastss %3,%%xmm6 \n"
- "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
- "vpermq $0xd8,%%ymm6,%%ymm6 \n"
- "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
- "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
- "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
- "vpslld $0x5,%%ymm4,%%ymm4 \n"
- "vpslld $0xb,%%ymm3,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
- "vpsrld $0x5,%%ymm0,%%ymm2 \n"
- "vpsrld $0x3,%%ymm0,%%ymm1 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm4,%%ymm2,%%ymm2 \n"
- "vpand %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpor %%ymm2,%%ymm1,%%ymm1 \n"
- "vpor %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(dither4) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBTORGB565DITHERROW_AVX2
-
-void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1b,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x5,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pslld $0xa,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "pslld $0xf,%%xmm7 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "psrad $0x10,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x6,%%xmm2 \n"
- "psrld $0x9,%%xmm3 \n"
- "pand %%xmm7,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm6,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
-}
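-
-// Scalar equivalent of the ARGB1555 packing above, assuming the usual bit
-// layout of 1 bit alpha over 5 bits each of R, G and B (illustrative helper,
-// not part of libyuv):
-static uint16_t PackARGB1555(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
-  return (uint16_t)(((a >> 7) << 15) | ((r >> 3) << 10) | ((g >> 3) << 5) |
-                    (b >> 3));
-}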
-
-void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xc,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm3 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm3,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "psrlq $0x4,%%xmm0 \n"
- "psrlq $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
-}
-#endif // HAS_RGB24TOARGBROW_SSSE3
-
-/*
-
-ARGBToAR30Row:
-
-Red and Blue
-With the 8 bit value in the upper byte of a 16 bit word, vpmulhuw by
-(1024+4) produces a 10 bit value in the low 10 bits of each 16 bit word,
-which is what is wanted for the blue channel. Red also needs to be shifted
-4 left, so it is multiplied by (1024+4)*16 instead.
-
-Alpha and Green
-Alpha and green already sit in the high bits, so vpand can zero out the
-other bits, keeping just the upper 2 bits of alpha and the 8 bit green. The
-same multiplier, (1024+4), would put a 10 bit green in the low bits. Alpha
-needs a simple multiplier to shift it into position: it wants a gap of 10
-bits above the green. Green is 10 bits, so there are 6 bits in the low
-word; 4 more are needed, so a multiplier of 4 moves the 2 alpha bits into
-the upper 16 bits, and a further shift of 4 is a multiply by 16, giving
-(4*16) = 64. The result is then shifted left 10 to position the A and G
-channels.
-*/
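-
-// A scalar sketch of the 8-to-10 bit trick described above. With the 8 bit
-// channel value v in the high byte of a 16 bit word, pmulhuw keeps the top
-// 16 bits of the product, so multiplying by 1028 (= 1024 + 4) reduces to
-// (v * 1028) >> 8, which is (v << 2) | (v >> 6): the top 2 bits of v are
-// replicated into the bottom of the 10 bit result. Illustrative helper:
-static uint16_t Scale8To10(uint8_t v) {
-  uint16_t word = (uint16_t)(v << 8);                // v in the high byte
-  return (uint16_t)(((uint32_t)word * 1028) >> 16);  // what pmulhuw keeps
-}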
-
-// Shuffle tables to place the B and R bytes of each pixel in the high byte
-// of a pair of 16 bit lanes (low byte zeroed), ready for the AR30 multiplies.
-static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
- 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
-
-static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
- 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
-
-static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
-static const uint32_t kMaskRB10 = 0x3ff003ff;
-static const uint32_t kMaskAG10 = 0xc000ff00;
-static const uint32_t kMulAG10 = 64 * 65536 + 1028;
-
-void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
-
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
-
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleRB30), // %3
- "m"(kMulRB10), // %4
- "m"(kMaskRB10), // %5
- "m"(kMaskAG10), // %6
- "m"(kMulAG10) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-
-void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
-
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
-
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleBR30), // %3 reversed shuffler
- "m"(kMulRB10), // %4
- "m"(kMaskRB10), // %5
- "m"(kMaskAG10), // %6
- "m"(kMulAG10) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-
-#ifdef HAS_ARGBTOAR30ROW_AVX2
-void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
-
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
-
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleRB30), // %3
- "m"(kMulRB10), // %4
- "m"(kMaskRB10), // %5
- "m"(kMaskAG10), // %6
- "m"(kMulAG10) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif
-
-#ifdef HAS_ABGRTOAR30ROW_AVX2
-void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
-
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
-
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleBR30), // %3 reversed shuffler
- "m"(kMulRB10), // %4
- "m"(kMaskRB10), // %5
- "m"(kMaskAG10), // %6
- "m"(kMulAG10) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif
-
-#ifdef HAS_ARGBTOYROW_SSSE3
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBTOYROW_SSSE3
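-
-// A scalar sketch of the per-pixel math above, assuming libyuv's usual
-// BT.601 coefficients (kARGBToY = 13, 65, 33 for B, G, R): pmaddubsw and
-// phaddw form the weighted sum, psrlw $7 divides by 128, and paddb adds the
-// +16 studio-swing offset. Illustrative helper, not part of libyuv.
-static uint8_t RGBToY601(uint8_t r, uint8_t g, uint8_t b) {
-  return (uint8_t)(((33 * r + 65 * g + 13 * b) >> 7) + 16);
-}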
-
-#ifdef HAS_ARGBTOYJROW_SSSE3
-// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
-// Same as ARGBToYRow but with different (full range) coefficients; no +16
-// offset is added, and the sum is rounded before the shift.
-void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBTOYJROW_SSSE3
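-
-// The J variant in scalar form, assuming the full range JPeg coefficients
-// (kARGBToYJ = 15, 75, 38 for B, G, R) and the +64 bias from kAddYJ64, which
-// rounds the 7 bit fixed point sum instead of adding an offset. Illustrative
-// helper, not part of libyuv.
-static uint8_t RGBToYJpeg(uint8_t r, uint8_t g, uint8_t b) {
-  return (uint8_t)((38 * r + 75 * g + 15 * b + 64) >> 7);
-}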
-
-#ifdef HAS_ARGBTOYROW_AVX2
-// vpermd table to undo the cross-lane interleaving left by vphaddw and
-// vpackuswb.
-static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
-
-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm4 \n"
- "vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16), // %4
- "m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif // HAS_ARGBTOYROW_AVX2
-
-#ifdef HAS_ARGBTOYJROW_AVX2
-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm4 \n"
- "vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
- "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64), // %4
- "m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif // HAS_ARGBTOYJROW_AVX2
-
-#ifdef HAS_ARGBTOUVROW_SSSE3
-void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kARGBToV), // %5
- "m"(kARGBToU), // %6
- "m"(kAddUV128) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
-}
-#endif // HAS_ARGBTOUVROW_SSSE3
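-
-// Scalar view of the chroma math above, assuming libyuv's BT.601 style
-// coefficients (112, -74, -38 for U and 112, -94, -18 for V, applied to
-// B/G/R and R/G/B respectively): each output sample comes from a 2x2 average
-// of ARGB pixels, then a signed weighted sum scaled by >>8 with a +128 bias
-// (the 0x8080 below folds in the bias plus rounding). Illustrative helpers:
-static uint8_t RGBToU601(uint8_t r, uint8_t g, uint8_t b) {
-  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
-}
-static uint8_t RGBToV601(uint8_t r, uint8_t g, uint8_t b) {
-  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
-}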
-
-#ifdef HAS_ARGBTOUVROW_AVX2
-// vpshufb table to undo the within-lane word interleaving left by vphaddw,
-// applied to the packed shorts.
-static const lvec8 kShufARGBToUV_AVX = {
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
-void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vbroadcastf128 %5,%%ymm5 \n"
- "vbroadcastf128 %6,%%ymm6 \n"
- "vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
-
- "vextractf128 $0x0,%%ymm0,(%1) \n"
- "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kAddUV128), // %5
- "m"(kARGBToV), // %6
- "m"(kARGBToU), // %7
- "m"(kShufARGBToUV_AVX) // %8
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBTOUVROW_AVX2
-
-#ifdef HAS_ARGBTOUVJROW_AVX2
-void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vbroadcastf128 %5,%%ymm5 \n"
- "vbroadcastf128 %6,%%ymm6 \n"
- "vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
-
- "vextractf128 $0x0,%%ymm0,(%1) \n"
- "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kAddUVJ128), // %5
- "m"(kARGBToVJ), // %6
- "m"(kARGBToUJ), // %7
- "m"(kShufARGBToUV_AVX) // %8
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBTOUVJROW_AVX2
-
-#ifdef HAS_ARGBTOUVJROW_SSSE3
-void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kARGBToVJ), // %5
- "m"(kARGBToUJ), // %6
- "m"(kAddUVJ128) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
-}
-#endif // HAS_ARGBTOUVJROW_SSSE3
-
-#ifdef HAS_ARGBTOUV444ROW_SSSE3
-void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movdqa %4,%%xmm3 \n"
- "movdqa %5,%%xmm4 \n"
- "movdqa %6,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "lea 0x40(%0),%0 \n"
- "movdqu %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "m"(kARGBToV), // %4
- "m"(kARGBToU), // %5
- "m"(kAddUV128) // %6
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
-}
-#endif // HAS_ARGBTOUV444ROW_SSSE3
-
-void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
- asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
- int src_stride_bgra,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_bgra0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_bgra)), // %4
- "m"(kBGRAToV), // %5
- "m"(kBGRAToU), // %6
- "m"(kAddUV128) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
-}
-
-void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
- asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
- asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kRGBAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
- int src_stride_abgr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_abgr0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_abgr)), // %4
- "m"(kABGRToV), // %5
- "m"(kABGRToU), // %6
- "m"(kAddUV128) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
-}
-
-void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
- int src_stride_rgba,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_rgba0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_rgba)), // %4
- "m"(kRGBAToV), // %5
- "m"(kRGBAToU), // %6
- "m"(kAddUV128) // %7
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
-}
-
-#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
-
-// Read 8 UV from 444
-#define READYUV444 \
- "movq (%[u_buf]),%%xmm0 \n" \
- "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x8(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "movq (%[y_buf]),%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n"
-
-// Read 4 UV from 422, upsample to 8 UV
-#define READYUV422 \
- "movd (%[u_buf]),%%xmm0 \n" \
- "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x4(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movq (%[y_buf]),%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n"
-
-// Read 4 UV from 422 10 bit, upsample to 8 UV
-// TODO(fbarchard): Consider shufb to replace pack/unpack
-// TODO(fbarchard): Consider pmulhuw to replace psraw
-// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
-#define READYUV210 \
- "movq (%[u_buf]),%%xmm0 \n" \
- "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x8(%[u_buf]),%[u_buf] \n" \
- "punpcklwd %%xmm1,%%xmm0 \n" \
- "psraw $0x2,%%xmm0 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movdqu (%[y_buf]),%%xmm4 \n" \
- "psllw $0x6,%%xmm4 \n" \
- "lea 0x10(%[y_buf]),%[y_buf] \n"
-
-// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 \
- "movd (%[u_buf]),%%xmm0 \n" \
- "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x4(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movq (%[y_buf]),%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n" \
- "movq (%[a_buf]),%%xmm5 \n" \
- "lea 0x8(%[a_buf]),%[a_buf] \n"
-
-// Read 4 UV from NV12, upsample to 8 UV
-#define READNV12 \
- "movq (%[uv_buf]),%%xmm0 \n" \
- "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movq (%[y_buf]),%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n"
-
-// Read 4 VU from NV21, upsample to 8 UV
-#define READNV21 \
- "movq (%[vu_buf]),%%xmm0 \n" \
- "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
- "pshufb %[kShuffleNV21], %%xmm0 \n" \
- "movq (%[y_buf]),%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n"
-
-// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
-#define READYUY2 \
- "movdqu (%[yuy2_buf]),%%xmm4 \n" \
- "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
- "movdqu (%[yuy2_buf]),%%xmm0 \n" \
- "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
- "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
-
-// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
-#define READUYVY \
- "movdqu (%[uyvy_buf]),%%xmm4 \n" \
- "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
- "movdqu (%[uyvy_buf]),%%xmm0 \n" \
- "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
- "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
-
-#if defined(__x86_64__)
-#define YUVTORGB_SETUP(yuvconstants) \
- "movdqa (%[yuvconstants]),%%xmm8 \n" \
- "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
- "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
- "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
- "movdqa 128(%[yuvconstants]),%%xmm12 \n" \
- "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
- "movdqa 192(%[yuvconstants]),%%xmm14 \n"
-// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB16(yuvconstants) \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm3 \n" \
- "movdqa %%xmm11,%%xmm0 \n" \
- "pmaddubsw %%xmm8,%%xmm1 \n" \
- "psubw %%xmm1,%%xmm0 \n" \
- "movdqa %%xmm12,%%xmm1 \n" \
- "pmaddubsw %%xmm9,%%xmm2 \n" \
- "psubw %%xmm2,%%xmm1 \n" \
- "movdqa %%xmm13,%%xmm2 \n" \
- "pmaddubsw %%xmm10,%%xmm3 \n" \
- "psubw %%xmm3,%%xmm2 \n" \
- "pmulhuw %%xmm14,%%xmm4 \n" \
- "paddsw %%xmm4,%%xmm0 \n" \
- "paddsw %%xmm4,%%xmm1 \n" \
- "paddsw %%xmm4,%%xmm2 \n"
-#define YUVTORGB_REGS \
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
-
-#else
-#define YUVTORGB_SETUP(yuvconstants)
-// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB16(yuvconstants) \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm3 \n" \
- "movdqa 96(%[yuvconstants]),%%xmm0 \n" \
- "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \
- "psubw %%xmm1,%%xmm0 \n" \
- "movdqa 128(%[yuvconstants]),%%xmm1 \n" \
- "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \
- "psubw %%xmm2,%%xmm1 \n" \
- "movdqa 160(%[yuvconstants]),%%xmm2 \n" \
- "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \
- "psubw %%xmm3,%%xmm2 \n" \
- "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
- "paddsw %%xmm4,%%xmm0 \n" \
- "paddsw %%xmm4,%%xmm1 \n" \
- "paddsw %%xmm4,%%xmm2 \n"
-#define YUVTORGB_REGS
-#endif
-
-#define YUVTORGB(yuvconstants) \
- YUVTORGB16(yuvconstants) \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n"
-
-// Store 8 ARGB values.
-#define STOREARGB \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklbw %%xmm5,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm1 \n" \
- "punpcklwd %%xmm2,%%xmm0 \n" \
- "punpckhwd %%xmm2,%%xmm1 \n" \
- "movdqu %%xmm0,(%[dst_argb]) \n" \
- "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
- "lea 0x20(%[dst_argb]), %[dst_argb] \n"
-
-// Store 8 RGBA values.
-#define STORERGBA \
- "pcmpeqb %%xmm5,%%xmm5 \n" \
- "punpcklbw %%xmm2,%%xmm1 \n" \
- "punpcklbw %%xmm0,%%xmm5 \n" \
- "movdqa %%xmm5,%%xmm0 \n" \
- "punpcklwd %%xmm1,%%xmm5 \n" \
- "punpckhwd %%xmm1,%%xmm0 \n" \
- "movdqu %%xmm5,(%[dst_rgba]) \n" \
- "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
- "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
-
-// Store 8 AR30 values.
-#define STOREAR30 \
- "psraw $0x4,%%xmm0 \n" \
- "psraw $0x4,%%xmm1 \n" \
- "psraw $0x4,%%xmm2 \n" \
- "pminsw %%xmm7,%%xmm0 \n" \
- "pminsw %%xmm7,%%xmm1 \n" \
- "pminsw %%xmm7,%%xmm2 \n" \
- "pmaxsw %%xmm6,%%xmm0 \n" \
- "pmaxsw %%xmm6,%%xmm1 \n" \
- "pmaxsw %%xmm6,%%xmm2 \n" \
- "psllw $0x4,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm3 \n" \
- "punpcklwd %%xmm2,%%xmm0 \n" \
- "punpckhwd %%xmm2,%%xmm3 \n" \
- "movdqa %%xmm1,%%xmm2 \n" \
- "punpcklwd %%xmm5,%%xmm1 \n" \
- "punpckhwd %%xmm5,%%xmm2 \n" \
- "pslld $0xa,%%xmm1 \n" \
- "pslld $0xa,%%xmm2 \n" \
- "por %%xmm1,%%xmm0 \n" \
- "por %%xmm2,%%xmm3 \n" \
- "movdqu %%xmm0,(%[dst_ar30]) \n" \
- "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
- "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
-
-void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV444
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-
-void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
- "sub %[u_buf],%[v_buf] \n"
-
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB(yuvconstants)
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0,(%[dst_rgb24]) \n"
- "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
- "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
- "subl $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
-#if defined(__i386__)
- [width]"+m"(width) // %[width]
-#else
- [width]"+rm"(width) // %[width]
-#endif
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
- [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
- );
-}
-
-void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-
-void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_ar30,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
-
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB16(yuvconstants)
- STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
-}
-
-// 10 bit YUV to ARGB
-void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
- const uint16_t* u_buf,
- const uint16_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV210
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-
-// 10 bit YUV to AR30
-void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
- const uint16_t* u_buf,
- const uint16_t* v_buf,
- uint8_t* dst_ar30,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
-
- LABELALIGN
- "1: \n"
- READYUV210
- YUVTORGB16(yuvconstants)
- STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
-}
-
-#ifdef HAS_I422ALPHATOARGBROW_SSSE3
-void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
-
- LABELALIGN
- "1: \n"
- READYUVA422
- YUVTORGB(yuvconstants)
- STOREARGB
- "subl $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [a_buf]"+r"(a_buf), // %[a_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
-#if defined(__i386__)
- [width]"+m"(width) // %[width]
-#else
- [width]"+rm"(width) // %[width]
-#endif
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-#endif // HAS_I422ALPHATOARGBROW_SSSE3
-
-void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* uv_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READNV12
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-
-void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* vu_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READNV21
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [vu_buf]"+r"(vu_buf), // %[vu_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleNV21]"m"(kShuffleNV21)
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-
-void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUY2
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
- [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-
-void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READUYVY
- YUVTORGB(yuvconstants)
- STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleUYVYY]"m"(kShuffleUYVYY),
- [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-
-void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB(yuvconstants)
- STORERGBA
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-
-#endif // HAS_I422TOARGBROW_SSSE3
-
-// Read 16 UV from 444
-#define READYUV444_AVX2 \
- "vmovdqu (%[u_buf]),%%xmm0 \n" \
- "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x10(%[u_buf]),%[u_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vmovdqu (%[y_buf]),%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea 0x10(%[y_buf]),%[y_buf] \n"
-
-// Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2 \
- "vmovq (%[u_buf]),%%xmm0 \n" \
- "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x8(%[u_buf]),%[u_buf] \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu (%[y_buf]),%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea 0x10(%[y_buf]),%[y_buf] \n"
-
-// Read 8 UV from 210 10 bit, upsample to 16 UV
-// TODO(fbarchard): Consider vshufb to replace pack/unpack
-// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
-#define READYUV210_AVX2 \
- "vmovdqu (%[u_buf]),%%xmm0 \n" \
- "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x10(%[u_buf]),%[u_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
- "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpsraw $0x2,%%ymm0,%%ymm0 \n" \
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu (%[y_buf]),%%ymm4 \n" \
- "vpsllw $0x6,%%ymm4,%%ymm4 \n" \
- "lea 0x20(%[y_buf]),%[y_buf] \n"
-
-// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
-#define READYUVA422_AVX2 \
- "vmovq (%[u_buf]),%%xmm0 \n" \
- "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x8(%[u_buf]),%[u_buf] \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu (%[y_buf]),%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea 0x10(%[y_buf]),%[y_buf] \n" \
- "vmovdqu (%[a_buf]),%%xmm5 \n" \
- "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
- "lea 0x10(%[a_buf]),%[a_buf] \n"
-
-// Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2 \
- "vmovdqu (%[uv_buf]),%%xmm0 \n" \
- "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu (%[y_buf]),%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea 0x10(%[y_buf]),%[y_buf] \n"
-
-// Read 8 VU from NV21, upsample to 16 UV.
-#define READNV21_AVX2 \
- "vmovdqu (%[vu_buf]),%%xmm0 \n" \
- "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
- "vmovdqu (%[y_buf]),%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea 0x10(%[y_buf]),%[y_buf] \n"
-
-// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2 \
- "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
- "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
- "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \
- "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
- "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
-
-// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2 \
- "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
- "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
- "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \
- "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
- "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
-
-#if defined(__x86_64__)
-#define YUVTORGB_SETUP_AVX2(yuvconstants) \
- "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
- "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
- "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
- "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
- "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
- "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
- "vmovdqa 192(%[yuvconstants]),%%ymm14 \n"
-
-#define YUVTORGB16_AVX2(yuvconstants) \
- "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
- "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
- "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
- "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
- "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
- "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
- "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
- "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
- "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
- "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
-
-#define YUVTORGB_REGS_AVX2 \
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
-
-#else // Convert 16 pixels: 16 UV and 16 Y.
-
-#define YUVTORGB_SETUP_AVX2(yuvconstants)
-#define YUVTORGB16_AVX2(yuvconstants) \
- "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \
- "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \
- "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \
- "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \
- "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
- "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \
- "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
- "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \
- "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
- "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
- "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
- "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
- "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
-#define YUVTORGB_REGS_AVX2
-#endif
-
-#define YUVTORGB_AVX2(yuvconstants) \
- YUVTORGB16_AVX2(yuvconstants) \
- "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
- "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
- "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
- "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
-
-// Store 16 ARGB values.
-#define STOREARGB_AVX2 \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
- "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
- "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
- "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
- "vmovdqu %%ymm1,(%[dst_argb]) \n" \
- "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
- "lea 0x40(%[dst_argb]), %[dst_argb] \n"
-
-// Store 16 AR30 values.
-#define STOREAR30_AVX2 \
- "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
- "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
- "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
- "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
- "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
- "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
- "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
- "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
- "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
- "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
- "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
- "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
- "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
- "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
- "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
- "vpslld $0xa,%%ymm1,%%ymm1 \n" \
- "vpslld $0xa,%%ymm2,%%ymm2 \n" \
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
- "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
- "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
- "lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
-
-#ifdef HAS_I444TOARGBROW_AVX2
-// 16 pixels
-// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV444_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_I444TOARGBROW_AVX2
-
-#if defined(HAS_I422TOARGBROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV422_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
-
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_I422TOARGBROW_AVX2
-
-#if defined(HAS_I422TOAR30ROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
-void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_ar30,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
-
- LABELALIGN
- "1: \n"
- READYUV422_AVX2
- YUVTORGB16_AVX2(yuvconstants)
- STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
-
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
-}
-#endif // HAS_I422TOAR30ROW_AVX2
-
-#if defined(HAS_I210TOARGBROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
- const uint16_t* u_buf,
- const uint16_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV210_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
-
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_I210TOARGBROW_AVX2
-
-#if defined(HAS_I210TOAR30ROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
-void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
- const uint16_t* u_buf,
- const uint16_t* v_buf,
- uint8_t* dst_ar30,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
-
- LABELALIGN
- "1: \n"
- READYUV210_AVX2
- YUVTORGB16_AVX2(yuvconstants)
- STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
-
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_I210TOAR30ROW_AVX2
-
-#if defined(HAS_I422ALPHATOARGBROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
-void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
-
- LABELALIGN
- "1: \n"
- READYUVA422_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "subl $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [a_buf]"+r"(a_buf), // %[a_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
-#if defined(__i386__)
- [width]"+m"(width) // %[width]
-#else
- [width]"+rm"(width) // %[width]
-#endif
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-#endif // HAS_I422ALPHATOARGBROW_AVX2
-
-#if defined(HAS_I422TORGBAROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUV422_AVX2
- YUVTORGB_AVX2(yuvconstants)
-
- // Step 3: Weave into RGBA
- "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
- "vpermq $0xd8,%%ymm2,%%ymm2 \n"
- "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
- "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
- "vmovdqu %%ymm0,(%[dst_argb]) \n"
- "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
- "lea 0x40(%[dst_argb]),%[dst_argb] \n"
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_I422TORGBAROW_AVX2
-
-#if defined(HAS_NV12TOARGBROW_AVX2)
-// 16 pixels.
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
- const uint8_t* uv_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READNV12_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-#endif // HAS_NV12TOARGBROW_AVX2
-
-#if defined(HAS_NV21TOARGBROW_AVX2)
-// 16 pixels.
-// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
- const uint8_t* vu_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READNV21_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [vu_buf]"+r"(vu_buf), // %[vu_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleNV21]"m"(kShuffleNV21)
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-#endif // HAS_NV21TOARGBROW_AVX2
-
-#if defined(HAS_YUY2TOARGBROW_AVX2)
-// 16 pixels.
-// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READYUY2_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
- [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-#endif // HAS_YUY2TOARGBROW_AVX2
-
-#if defined(HAS_UYVYTOARGBROW_AVX2)
-// 16 pixels.
-// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- // clang-format off
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- READUYVY_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleUYVYY]"m"(kShuffleUYVYY),
- [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
- : "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
- // clang-format on
-}
-#endif // HAS_UYVYTOARGBROW_AVX2
-
-#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
- asm volatile(
- "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
- "movd %%eax,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 *
- // 16
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "psubusw %%xmm3,%%xmm0 \n"
- "psrlw $6, %%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
-
- // Step 2: Weave into ARGB
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "por %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
-
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
-}
-#endif // HAS_I400TOARGBROW_SSE2
-
-#ifdef HAS_I400TOARGBROW_AVX2
-// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
-// Note: vpunpcklbw mutates the 128-bit lane order and vpackuswb unmutates it.
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
- asm volatile(
- "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 *
- // 16
- "vmovd %%eax,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
- "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
- "vmovd %%eax,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpslld $0x18,%%ymm4,%%ymm4 \n"
-
- LABELALIGN
- "1: \n"
- // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
- "vmovdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
- "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
-}
-#endif // HAS_I400TOARGBROW_AVX2
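-
-// A scalar sketch (not part of the original source) of the I400 math above,
-// following the same fixed point steps as the SSE2/AVX2 rows; the helper
-// name is hypothetical.
-static inline uint32_t I400ToARGBPixel_Sketch(uint8_t y) {
-  int g = (y * 257 * 18997) >> 16;  // replicate y to 16 bits, * 1.164 (pmulhuw)
-  g -= 1160;                        // remove the black level bias (psubusw...
-  if (g < 0) g = 0;                 //   ...saturates at zero)
-  g >>= 6;                          // psrlw $0x6 back to 8 bit range
-  if (g > 255) g = 255;             // packuswb saturation
-  return 0xff000000u | ((uint32_t)g << 16) | ((uint32_t)g << 8) | (uint32_t)g;
-}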
-
-#ifdef HAS_MIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
- 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
-
-void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile(
-
- "movdqa %3,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu -0x10(%0,%2,1),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kShuffleMirror) // %3
- : "memory", "cc", "xmm0", "xmm5");
-}
-#endif // HAS_MIRRORROW_SSSE3
-
-#ifdef HAS_MIRRORROW_AVX2
-void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile(
-
- "vbroadcastf128 %3,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpermq $0x4e,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kShuffleMirror) // %3
- : "memory", "cc", "xmm0", "xmm5");
-}
-#endif // HAS_MIRRORROW_AVX2
-
-#ifdef HAS_MIRRORUVROW_SSSE3
-// Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
- 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile(
- "movdqa %4,%%xmm1 \n"
- "lea -0x10(%0,%3,2),%0 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "pshufb %%xmm1,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n"
- "movhpd %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $8,%3 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(temp_width) // %3
- : "m"(kShuffleMirrorUV) // %4
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_MIRRORUVROW_SSSE3
-
-#ifdef HAS_ARGBMIRRORROW_SSE2
-
-void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile(
-
- "lea -0x10(%0,%2,4),%0 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufd $0x1b,%%xmm0,%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- :
- : "memory", "cc", "xmm0");
-}
-#endif // HAS_ARGBMIRRORROW_SSE2
-
-#ifdef HAS_ARGBMIRRORROW_AVX2
-// Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
-void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- intptr_t temp_width = (intptr_t)(width);
- asm volatile(
-
- "vmovdqu %3,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kARGBShuffleMirror_AVX2) // %3
- : "memory", "cc", "xmm0", "xmm5");
-}
-#endif // HAS_ARGBMIRRORROW_AVX2
-
-#ifdef HAS_SPLITUVROW_AVX2
-void SplitUVRow_AVX2(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SPLITUVROW_AVX2
-
-#ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm2,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SPLITUVROW_SSE2
-
-#ifdef HAS_MERGEUVROW_AVX2
-void MergeUVRow_AVX2(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- asm volatile(
-
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2,(%2) \n"
- "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
- "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
- "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_MERGEUVROW_AVX2
-
-#ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- asm volatile(
-
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqu %%xmm0,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_MERGEUVROW_SSE2
-
-// Use scale to convert lsb formats to msb, depending on how many bits there are:
-// 128 = 9 bits
-// 64 = 10 bits
-// 16 = 12 bits
-// 1 = 16 bits
-#ifdef HAS_MERGEUVROW_16_AVX2
-void MergeUVRow_16_AVX2(const uint16_t* src_u,
- const uint16_t* src_v,
- uint16_t* dst_uv,
- int scale,
- int width) {
- // clang-format off
- asm volatile (
- "vmovd %4,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
-
- // 16 pixels per loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu (%0,%1,1),%%ymm1 \n"
- "add $0x20,%0 \n"
-
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
- "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2,(%2) \n"
- "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
- "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
- "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "add $0x40,%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- : "r"(scale) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
- // clang-format on
-}
-#endif // HAS_MERGEUVROW_16_AVX2
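-
-// A scalar sketch (not part of the original source) of MergeUVRow_16_AVX2:
-// each lsb justified sample is shifted up to the msb by the power-of-two
-// scale (the vpmullw), then U and V are interleaved. Hypothetical name.
-static void MergeUVRow_16_Sketch(const uint16_t* src_u, const uint16_t* src_v,
-                                 uint16_t* dst_uv, int scale, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_uv[2 * i + 0] = (uint16_t)(src_u[i] * scale);  // e.g. scale 64 for 10 bits
-    dst_uv[2 * i + 1] = (uint16_t)(src_v[i] * scale);
-  }
-}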
-
-// Use scale to convert lsb formats to msb, depending on how many bits there are:
-// 128 = 9 bits
-// 64 = 10 bits
-// 16 = 12 bits
-// 1 = 16 bits
-#ifdef HAS_MULTIPLYROW_16_AVX2
-void MultiplyRow_16_AVX2(const uint16_t* src_y,
- uint16_t* dst_y,
- int scale,
- int width) {
- // clang-format off
- asm volatile (
- "vmovd %3,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
-
- // 32 pixels per loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%0,%1) \n"
- "vmovdqu %%ymm1,0x20(%0,%1) \n"
- "add $0x40,%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_y), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "r"(scale) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm3");
- // clang-format on
-}
-#endif // HAS_MULTIPLYROW_16_AVX2
-
-// Use scale to convert lsb formats to msb, depending on how many bits there are:
-// 32768 = 9 bits
-// 16384 = 10 bits
-// 4096 = 12 bits
-// 256 = 16 bits
-void Convert16To8Row_SSSE3(const uint16_t* src_y,
- uint8_t* dst_y,
- int scale,
- int width) {
- // clang-format off
- asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
-
- // 16 pixels per loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "add $0x20,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "r"(scale) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
- // clang-format on
-}
-
-#ifdef HAS_CONVERT16TO8ROW_AVX2
-void Convert16To8Row_AVX2(const uint16_t* src_y,
- uint8_t* dst_y,
- int scale,
- int width) {
- // clang-format off
- asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
-
- // 32 pixels per loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "add $0x40,%0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_y), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "r"(scale) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
- // clang-format on
-}
-#endif // HAS_CONVERT16TO8ROW_AVX2
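-
-// A scalar sketch (not part of the original source) of the pmulhuw trick in
-// the Convert16To8 rows: (v * scale) >> 16 maps an lsb justified 9-16 bit
-// sample down to 8 bits, e.g. scale 16384 for 10 bit input. Hypothetical name.
-static void Convert16To8Row_Sketch(const uint16_t* src_y, uint8_t* dst_y,
-                                   int scale, int width) {
-  for (int i = 0; i < width; ++i) {
-    uint32_t v = ((uint32_t)src_y[i] * (uint32_t)scale) >> 16;  // pmulhuw
-    dst_y[i] = (uint8_t)(v > 255 ? 255 : v);                    // packuswb clamp
-  }
-}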
-
-// Use scale to convert to lsb formats, depending on how many bits there are:
-// 512 = 9 bits
-// 1024 = 10 bits
-// 4096 = 12 bits
-// TODO(fbarchard): reduce to SSE2
-void Convert8To16Row_SSE2(const uint8_t* src_y,
- uint16_t* dst_y,
- int scale,
- int width) {
- // clang-format off
- asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
-
- // 16 pixels per loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "add $0x10,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "r"(scale) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
- // clang-format on
-}
-
-#ifdef HAS_CONVERT8TO16ROW_AVX2
-void Convert8To16Row_AVX2(const uint8_t* src_y,
- uint16_t* dst_y,
- int scale,
- int width) {
- // clang-format off
- asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
-
- // 32 pixels per loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "add $0x40,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_y), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "r"(scale) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
- // clang-format on
-}
-#endif // HAS_CONVERT8TO16ROW_AVX2
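-
-// A scalar sketch (not part of the original source) of the Convert8To16 rows:
-// the unpack-with-self makes v * 257 (v in both bytes of a word) and pmulhuw
-// rescales it, e.g. scale 1024 maps 255 to 1023 for 10 bits. Hypothetical name.
-static void Convert8To16Row_Sketch(const uint8_t* src_y, uint16_t* dst_y,
-                                   int scale, int width) {
-  for (int i = 0; i < width; ++i) {
-    uint32_t v = (uint32_t)src_y[i] * 257u;              // punpcklbw with self
-    dst_y[i] = (uint16_t)((v * (uint32_t)scale) >> 16);  // pmulhuw
-  }
-}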
-
-#ifdef HAS_SPLITRGBROW_SSSE3
-
-// Shuffle table for converting RGB to Planar.
-static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
- 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
- 2u, 5u, 8u, 11u, 14u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 1u,
- 4u, 7u, 10u, 13u};
-
-static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
- 3u, 6u, 9u, 12u, 15u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 2u,
- 5u, 8u, 11u, 14u};
-
-static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
- 4u, 7u, 10u, 13u, 128u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 0u, 3u,
- 6u, 9u, 12u, 15u};
-
-void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
- uint8_t* dst_r,
- uint8_t* dst_g,
- uint8_t* dst_b,
- int width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
- "lea 0x10(%3),%3 \n"
- "lea 0x30(%0),%0 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_rgb), // %0
- "+r"(dst_r), // %1
- "+r"(dst_g), // %2
- "+r"(dst_b), // %3
- "+r"(width) // %4
- : "m"(kShuffleMaskRGBToR0), // %5
- "m"(kShuffleMaskRGBToR1), // %6
- "m"(kShuffleMaskRGBToR2), // %7
- "m"(kShuffleMaskRGBToG0), // %8
- "m"(kShuffleMaskRGBToG1), // %9
- "m"(kShuffleMaskRGBToG2), // %10
- "m"(kShuffleMaskRGBToB0), // %11
- "m"(kShuffleMaskRGBToB1), // %12
- "m"(kShuffleMaskRGBToB2) // %13
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_SPLITRGBROW_SSSE3
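-
-// A scalar sketch (not part of the original source) of the deinterleave the
-// shuffle tables above implement: packed 24 bit RGB split into three planes.
-// Hypothetical name.
-static void SplitRGBRow_Sketch(const uint8_t* src_rgb, uint8_t* dst_r,
-                               uint8_t* dst_g, uint8_t* dst_b, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_r[i] = src_rgb[3 * i + 0];
-    dst_g[i] = src_rgb[3 * i + 1];
-    dst_b[i] = src_rgb[3 * i + 2];
-  }
-}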
-
-#ifdef HAS_MERGERGBROW_SSSE3
-
-// Shuffle table for converting Planar to RGB.
-static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
- 2u, 128u, 128u, 3u, 128u, 128u,
- 4u, 128u, 128u, 5u};
-static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
- 128u, 2u, 128u, 128u, 3u, 128u,
- 128u, 4u, 128u, 128u};
-static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
- 128u, 128u, 2u, 128u, 128u, 3u,
- 128u, 128u, 4u, 128u};
-
-static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
- 7u, 128u, 128u, 8u, 128u, 128u,
- 9u, 128u, 128u, 10u};
-static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
- 128u, 7u, 128u, 128u, 8u, 128u,
- 128u, 9u, 128u, 128u};
-static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
- 128u, 128u, 8u, 128u, 128u, 9u,
- 128u, 128u, 10u, 128u};
-
-static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
- 12u, 128u, 128u, 13u, 128u, 128u,
- 14u, 128u, 128u, 15u};
-static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
- 128u, 13u, 128u, 128u, 14u, 128u,
- 128u, 15u, 128u, 128u};
-static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
- 128u, 128u, 13u, 128u, 128u, 14u,
- 128u, 128u, 15u, 128u};
-
-void MergeRGBRow_SSSE3(const uint8_t* src_r,
- const uint8_t* src_g,
- const uint8_t* src_b,
- uint8_t* dst_rgb,
- int width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,16(%3) \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,32(%3) \n"
-
- "lea 0x10(%0),%0 \n"
- "lea 0x10(%1),%1 \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x30(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_r), // %0
- "+r"(src_g), // %1
- "+r"(src_b), // %2
- "+r"(dst_rgb), // %3
- "+r"(width) // %4
- : "m"(kShuffleMaskRToRGB0), // %5
- "m"(kShuffleMaskGToRGB0), // %6
- "m"(kShuffleMaskBToRGB0), // %7
- "m"(kShuffleMaskRToRGB1), // %8
- "m"(kShuffleMaskGToRGB1), // %9
- "m"(kShuffleMaskBToRGB1), // %10
- "m"(kShuffleMaskRToRGB2), // %11
- "m"(kShuffleMaskGToRGB2), // %12
- "m"(kShuffleMaskBToRGB2) // %13
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_MERGERGBROW_SSSE3
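-
-// A scalar sketch (not part of the original source) of the inverse of the
-// split above: three planes interleaved back into packed 24 bit RGB.
-// Hypothetical name.
-static void MergeRGBRow_Sketch(const uint8_t* src_r, const uint8_t* src_g,
-                               const uint8_t* src_b, uint8_t* dst_rgb,
-                               int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_rgb[3 * i + 0] = src_r[i];
-    dst_rgb[3 * i + 1] = src_g[i];
-    dst_rgb[3 * i + 2] = src_b[i];
-  }
-}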
-
-#ifdef HAS_COPYROW_SSE2
-void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "test $0xf,%0 \n"
- "jne 2f \n"
- "test $0xf,%1 \n"
- "jne 2f \n"
-
- LABELALIGN
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 9f \n"
-
- LABELALIGN
- "2: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 2b \n"
-
- LABELALIGN "9: \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_COPYROW_SSE2
-
-#ifdef HAS_COPYROW_AVX
-void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_COPYROW_AVX
-
-#ifdef HAS_COPYROW_ERMS
-// Handles any width (multiple of 1).
-void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
- size_t width_tmp = (size_t)(width);
- asm volatile(
-
- "rep movsb \n"
- : "+S"(src), // %0
- "+D"(dst), // %1
- "+c"(width_tmp) // %2
- :
- : "memory", "cc");
-}
-#endif // HAS_COPYROW_ERMS
-
-#ifdef HAS_ARGBCOPYALPHAROW_SSE2
-// width in pixels
-void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBCOPYALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
-// width in pixels
-void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "vmovdqu 0x20(%0),%%ymm2 \n"
- "lea 0x40(%0),%0 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_ARGBCOPYALPHAROW_AVX2
-
-#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
-// width in pixels
-void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0), %%xmm0 \n"
- "movdqu 0x10(%0), %%xmm1 \n"
- "lea 0x20(%0), %0 \n"
- "psrld $0x18, %%xmm0 \n"
- "psrld $0x18, %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "packuswb %%xmm0, %%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1), %1 \n"
- "sub $0x8, %2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
-
-#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
-static const uvec8 kShuffleAlphaShort_AVX2 = {
- 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
- 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
-
-void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- asm volatile(
- "vmovdqa %3,%%ymm4 \n"
- "vbroadcastf128 %4,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0), %%ymm0 \n"
- "vmovdqu 0x20(%0), %%ymm1 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x40(%0), %%ymm2 \n"
- "vmovdqu 0x60(%0), %%ymm3 \n"
- "lea 0x80(%0), %0 \n"
- "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20, %2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+rm"(width) // %2
- : "m"(kPermdARGBToY_AVX), // %3
- "m"(kShuffleAlphaShort_AVX2) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
-// width in pixels
-void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
-
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm2 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpckhwd %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
-// width in pixels
-void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
-
- LABELALIGN
- "1: \n"
- "vpmovzxbd (%0),%%ymm1 \n"
- "vpmovzxbd 0x8(%0),%%ymm2 \n"
- "lea 0x10(%0),%0 \n"
- "vpslld $0x18,%%ymm1,%%ymm1 \n"
- "vpslld $0x18,%%ymm2,%%ymm2 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
-
-#ifdef HAS_SETROW_X86
-void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
- size_t width_tmp = (size_t)(width >> 2);
- const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
- asm volatile(
-
- "rep stosl \n"
- : "+D"(dst), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
-}
-
-void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
- size_t width_tmp = (size_t)(width);
- asm volatile(
-
- "rep stosb \n"
- : "+D"(dst), // %0
- "+c"(width_tmp) // %1
- : "a"(v8) // %2
- : "memory", "cc");
-}
-
-void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
- size_t width_tmp = (size_t)(width);
- asm volatile(
-
- "rep stosl \n"
- : "+D"(dst_argb), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
-}
-#endif // HAS_SETROW_X86
-
-#ifdef HAS_YUY2TOYROW_SSE2
-void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
- int stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-
-void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-
-void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
- int stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-
-void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-#endif // HAS_YUY2TOYROW_SSE2
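-
-// A scalar sketch (not part of the original source) of the YUY2 byte layout
-// the rows above unpack: Y0 U0 Y1 V0, luma in even bytes and chroma shared by
-// each pixel pair in odd bytes. Assumes an even width; hypothetical name.
-static void YUY2ToI422Row_Sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
-                                 uint8_t* dst_u, uint8_t* dst_v, int width) {
-  for (int i = 0; i < width; i += 2) {
-    dst_y[i + 0] = src_yuy2[2 * i + 0];  // pand 0x00ff keeps even bytes
-    dst_y[i + 1] = src_yuy2[2 * i + 2];
-    dst_u[i / 2] = src_yuy2[2 * i + 1];  // psrlw $0x8 keeps odd bytes
-    dst_v[i / 2] = src_yuy2[2 * i + 3];
-  }
-}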
-
-#ifdef HAS_YUY2TOYROW_AVX2
-void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
- int stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm1,(%1) \n"
- "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm1,(%1) \n"
- "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
- int stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm1,(%1) \n"
- "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm1,(%1) \n"
- "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-#endif // HAS_YUY2TOYROW_AVX2
-
-#ifdef HAS_ARGBBLENDROW_SSSE3
-// Shuffle table for isolating alpha.
-static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
- 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
-
-// Blend 4 pixels at a time, with a 1 pixel loop for the tail.
-void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0xf,%%xmm7 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
-
- // 4 pixel loop.
- LABELALIGN
- "40: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 99f \n"
-
- // 1 pixel loop.
- "91: \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 91b \n"
- "99: \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : "m"(kShuffleAlpha) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBBLENDROW_SSSE3
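-
-// A scalar sketch (not part of the original source) of the per-channel blend
-// above, assuming src_argb0 carries premultiplied (attenuated) color:
-// dst = min(255, fg + bg * (256 - fg_alpha) / 256), with the result alpha
-// forced opaque. Hypothetical name.
-static inline uint8_t BlendChannel_Sketch(uint8_t fg, uint8_t bg, uint8_t fg_a) {
-  uint32_t v = fg + ((bg * (256u - fg_a)) >> 8);  // pmullw, psrlw, paddusb
-  return (uint8_t)(v > 255 ? 255 : v);
-}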
-
-#ifdef HAS_BLENDPLANEROW_SSSE3
-// Blend 8 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_SSSE3(const uint8_t* src0,
- const uint8_t* src1,
- const uint8_t* alpha,
- uint8_t* dst,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "mov $0x807f807f,%%eax \n"
- "movd %%eax,%%xmm7 \n"
- "pshufd $0x0,%%xmm7,%%xmm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movq (%2),%%xmm0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm0 \n"
- "movq (%0,%2,1),%%xmm1 \n"
- "movq (%1,%2,1),%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "paddw %%xmm7,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%3,%2,1) \n"
- "lea 0x8(%2),%2 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
- : "+r"(src0), // %0
- "+r"(src1), // %1
- "+r"(alpha), // %2
- "+r"(dst), // %3
- "+rm"(width) // %4
- ::"memory",
- "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
-}
-#endif // HAS_BLENDPLANEROW_SSSE3
-
-#ifdef HAS_BLENDPLANEROW_AVX2
-// Blend 32 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_AVX2(const uint8_t* src0,
- const uint8_t* src1,
- const uint8_t* alpha,
- uint8_t* dst,
- int width) {
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsllw $0x8,%%ymm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm6 \n"
- "vbroadcastss %%xmm6,%%ymm6 \n"
- "mov $0x807f807f,%%eax \n"
- "vmovd %%eax,%%xmm7 \n"
- "vbroadcastss %%xmm7,%%ymm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
-
- // 32 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%2),%%ymm0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
- "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
- "vmovdqu (%0,%2,1),%%ymm1 \n"
- "vmovdqu (%1,%2,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
- "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%3,%2,1) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src0), // %0
- "+r"(src1), // %1
- "+r"(alpha), // %2
- "+r"(dst), // %3
- "+rm"(width) // %4
- ::"memory",
- "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_BLENDPLANEROW_AVX2
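-
-// A scalar sketch (not part of the original source) of the unsigned form of
-// the math in the comments above; the SIMD rows use the signed pmaddubsw
-// variant, which is intended to round the same way. Hypothetical name.
-static void BlendPlaneRow_Sketch(const uint8_t* src0, const uint8_t* src1,
-                                 const uint8_t* alpha, uint8_t* dst, int width) {
-  for (int i = 0; i < width; ++i) {
-    uint32_t a = alpha[i];
-    dst[i] = (uint8_t)((src0[i] * a + src1[i] * (255u - a) + 255u) >> 8);
-  }
-}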
-
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha
-static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
- 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
-static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
-// Attenuate 4 pixels at a time.
-void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "pslld $0x18,%%xmm3 \n"
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpcklbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm1,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "punpckhbw %%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "pand %%xmm3,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleAlpha0), // %3
- "m"(kShuffleAlpha1) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBATTENUATEROW_SSSE3
-
-#ifdef HAS_ARGBATTENUATEROW_AVX2
-// Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
- 128u, 128u, 14u, 15u, 14u, 15u,
- 14u, 15u, 128u, 128u};
-// Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vbroadcastf128 %3,%%ymm4 \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpslld $0x18,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
- "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm6,%%ymm6 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpor %%ymm6,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleAlpha_AVX2) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif // HAS_ARGBATTENUATEROW_AVX2
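-
-// A scalar sketch (not part of the original source) of what attenuation
-// computes: each color channel is scaled by its pixel's alpha, roughly
-// c * a / 255. The rows above form it as ((c * 257) * (a * 257)) >> 24 via
-// the unpack-with-self and pmulhuw steps; alpha itself is passed through.
-// Hypothetical name.
-static inline uint8_t Attenuate_Sketch(uint8_t c, uint8_t a) {
-  return (uint8_t)((((uint32_t)c * 257u) * ((uint32_t)a * 257u)) >> 24);
-}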
-
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-// Unattenuate 4 pixels at a time.
-void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- uintptr_t alpha;
- asm volatile(
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movzb 0x03(%0),%3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x07(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width), // %2
- "=&r"(alpha) // %3
- : "r"(fixed_invtbl8) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBUNATTENUATEROW_SSE2
-
-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
-// Shuffle table duplicating alpha.
-static const uvec8 kUnattenShuffleAlpha_AVX2 = {
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
-// Unattenuate 8 pixels at a time.
-void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- uintptr_t alpha;
- asm volatile(
- "sub %0,%1 \n"
- "vbroadcastf128 %5,%%ymm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- // replace VPGATHER
- "movzb 0x03(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x07(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "movzb 0x13(%0),%3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x17(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x1b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x1f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
- "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
- "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
- "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
- // end of VPGATHER
-
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
- "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width), // %2
- "=&r"(alpha) // %3
- : "r"(fixed_invtbl8), // %4
- "m"(kUnattenShuffleAlpha_AVX2) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBUNATTENUATEROW_AVX2
-
-#ifdef HAS_ARGBGRAYROW_SSSE3
-// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
-void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
- asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "psrld $0x18,%%xmm2 \n"
- "psrld $0x18,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm3 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBGRAYROW_SSSE3
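A scalar sketch of the gray conversion above: a luma value is computed with the kARGBToYJ weights (signed bytes which sum to 128, so the +64 bias from kAddYJ64 and the >>7 give a rounded unity-gain result) and replicated into B, G and R; alpha is kept. The weight values themselves are defined elsewhere in this file, so they are passed in here:

static void ARGBGrayRow_Sketch(const uint8_t* src_argb,
                               uint8_t* dst_argb,
                               int width,
                               const int8_t w[3] /* kARGBToYJ B,G,R */) {
  for (int i = 0; i < width; ++i) {
    int y =
        (src_argb[0] * w[0] + src_argb[1] * w[1] + src_argb[2] * w[2] + 64) >>
        7;
    dst_argb[0] = dst_argb[1] = dst_argb[2] = (uint8_t)y;
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}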
-
-#ifdef HAS_ARGBSEPIAROW_SSSE3
-// b = (r * 35 + g * 68 + b * 17) >> 7
-// g = (r * 45 + g * 88 + b * 22) >> 7
-// r = (r * 50 + g * 98 + b * 24) >> 7
-// Constant for ARGB color to sepia tone
-static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
- 17, 68, 35, 0, 17, 68, 35, 0};
-
-static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
- 22, 88, 45, 0, 22, 88, 45, 0};
-
-static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
- 24, 98, 50, 0, 24, 98, 50, 0};
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
- asm volatile(
- "movdqa %2,%%xmm2 \n"
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm6 \n"
- "phaddw %%xmm6,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm5 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm5 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "psrld $0x18,%%xmm6 \n"
- "psrld $0x18,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm5 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "punpckhwd %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%1 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "m"(kARGBToSepiaB), // %2
- "m"(kARGBToSepiaG), // %3
- "m"(kARGBToSepiaR) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif // HAS_ARGBSEPIAROW_SSSE3
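Spelled out from the constants and the formulas in the comment above, the sepia transform reduces to this in-place scalar loop (alpha untouched, results saturated to 255):

static void ARGBSepiaRow_Sketch(uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);
    dst_argb += 4;
  }
}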
-
-#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
-// Transform 8 ARGB pixels (32 bytes) with color matrix.
-// Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const int8_t* matrix_argb,
- int width) {
- asm volatile(
- "movdqu (%3),%%xmm5 \n"
- "pshufd $0x00,%%xmm5,%%xmm2 \n"
- "pshufd $0x55,%%xmm5,%%xmm3 \n"
- "pshufd $0xaa,%%xmm5,%%xmm4 \n"
- "pshufd $0xff,%%xmm5,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm7 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddsw %%xmm7,%%xmm0 \n"
- "phaddsw %%xmm1,%%xmm6 \n"
- "psraw $0x6,%%xmm0 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm1 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm6 \n"
- "psraw $0x6,%%xmm1 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "punpcklwd %%xmm1,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm6 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm6,0x10(%1) \n"
- "lea 0x20(%0),%0 \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(matrix_argb) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
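A scalar sketch of the color-matrix transform: reading the pmaddubsw/pshufd usage above, matrix_argb holds four rows of four signed coefficients (B, G, R, A weights for each output channel), summed and shifted right by 6 (the psraw $0x6), then saturated:

static void ARGBColorMatrixRow_Sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      const int8_t* m,
                                      int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      int v = (src_argb[0] * m[c * 4 + 0] + src_argb[1] * m[c * 4 + 1] +
               src_argb[2] * m[c * 4 + 2] + src_argb[3] * m[c * 4 + 3]) >>
              6;
      dst_argb[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}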
-
-#ifdef HAS_ARGBQUANTIZEROW_SSE2
-// Quantize 4 ARGB pixels (16 bytes).
-void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
- int scale,
- int interval_size,
- int interval_offset,
- int width) {
- asm volatile(
- "movd %2,%%xmm2 \n"
- "movd %3,%%xmm3 \n"
- "movd %4,%%xmm4 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshufd $0x44,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "pshufd $0x44,%%xmm3,%%xmm3 \n"
- "pshuflw $0x40,%%xmm4,%%xmm4 \n"
- "pshufd $0x44,%%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "pslld $0x18,%%xmm6 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "pmullw %%xmm3,%%xmm0 \n"
- "movdqu (%0),%%xmm7 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "pand %%xmm6,%%xmm7 \n"
- "paddw %%xmm4,%%xmm0 \n"
- "paddw %%xmm4,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqu %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x4,%1 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBQUANTIZEROW_SSE2
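A scalar sketch of the quantization: scale is 16.16 fixed point (pmulhuw keeps the high 16 bits, i.e. >>16), mapping each channel into quantization steps, which pmullw/paddw then expand back by interval_size and interval_offset. Alpha is preserved via the 0xff000000 mask built above:

static void ARGBQuantizeRow_Sketch(uint8_t* dst_argb,
                                   int scale,
                                   int interval_size,
                                   int interval_offset,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {
      int v = ((dst_argb[c] * scale) >> 16) * interval_size + interval_offset;
      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst_argb += 4;  // dst_argb[3] (alpha) untouched
  }
}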
-
-#ifdef HAS_ARGBSHADEROW_SSE2
-// Shade 4 pixels at a time by specified value.
-void ARGBShadeRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- uint32_t value) {
- asm volatile(
- "movd %3,%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm2 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_ARGBSHADEROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_SSE2
-// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
-
- "pxor %%xmm5,%%xmm5 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_ARGBMULTIPLYROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_AVX2
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
-
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu (%1),%%ymm3 \n"
- "lea 0x20(%1),%1 \n"
- "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
- "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__AVX2__)
- ,
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBMULTIPLYROW_AVX2
-
-#ifdef HAS_ARGBADDROW_SSE2
-// Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_SSE2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_ARGBADDROW_SSE2
-
-#ifdef HAS_ARGBADDROW_AVX2
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_AVX2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpaddusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0");
-}
-#endif // HAS_ARGBADDROW_AVX2
-
-#ifdef HAS_ARGBSUBTRACTROW_SSE2
-// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psubusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_ARGBSUBTRACTROW_SSE2
-
-#ifdef HAS_ARGBSUBTRACTROW_AVX2
-// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpsubusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0");
-}
-#endif // HAS_ARGBSUBTRACTROW_AVX2
-
-#ifdef HAS_SOBELXROW_SSE2
-// SobelX as a matrix is
-// -1 0 1
-// -2 0 2
-// -1 0 1
-void SobelXRow_SSE2(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width) {
- asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "sub %0,%3 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x2(%0),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "movq 0x02(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x00(%0,%2,1),%%xmm2 \n"
- "movq 0x02(%0,%2,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%3,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(src_y2), // %2
- "+r"(dst_sobelx), // %3
- "+r"(width) // %4
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SOBELXROW_SSE2
-
-#ifdef HAS_SOBELYROW_SSE2
-// SobelY as a matrix is
-// -1 -2 -1
-// 0 0 0
-// 1 2 1
-void SobelYRow_SSE2(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width) {
- asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x1(%0),%%xmm1 \n"
- "movq 0x01(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x2(%0),%%xmm2 \n"
- "movq 0x02(%0,%1,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%2,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(dst_sobely), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SOBELYROW_SSE2
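Both Sobel kernels above reduce to a small signed filter followed by an absolute value (the pxor/psubw/pmaxsw idiom) and a clamp to 255 (packuswb). Scalar sketches of one output sample each:

static uint8_t SobelX_Sketch(const uint8_t* y0,
                             const uint8_t* y1,
                             const uint8_t* y2,
                             int x) {
  int sx = (y0[x] - y0[x + 2]) + 2 * (y1[x] - y1[x + 2]) +
           (y2[x] - y2[x + 2]);
  if (sx < 0) sx = -sx;
  return (uint8_t)(sx > 255 ? 255 : sx);
}

static uint8_t SobelY_Sketch(const uint8_t* y0, const uint8_t* y1, int x) {
  int sy = (y0[x] - y1[x]) + 2 * (y0[x + 1] - y1[x + 1]) +
           (y0[x + 2] - y1[x + 2]);
  if (sy < 0) sy = -sy;
  return (uint8_t)(sy > 255 ? 255 : sy);
}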
-
-#ifdef HAS_SOBELROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-void SobelRow_SSE2(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
-
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm2 \n"
- "punpckhbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm1 \n"
- "punpckhwd %%xmm2,%%xmm2 \n"
- "por %%xmm5,%%xmm1 \n"
- "por %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklwd %%xmm0,%%xmm3 \n"
- "punpckhwd %%xmm0,%%xmm0 \n"
- "por %%xmm5,%%xmm3 \n"
- "por %%xmm5,%%xmm0 \n"
- "movdqu %%xmm1,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "movdqu %%xmm3,0x20(%2) \n"
- "movdqu %%xmm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SOBELROW_SSE2
-
-#ifdef HAS_SOBELTOPLANEROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
-
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_y), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1");
-}
-#endif // HAS_SOBELTOPLANEROW_SSE2
-
-#ifdef HAS_SOBELXYROW_SSE2
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-void SobelXYRow_SSE2(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
-
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "paddusb %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "punpckhbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "punpcklbw %%xmm2,%%xmm4 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "punpcklwd %%xmm3,%%xmm6 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "punpcklwd %%xmm0,%%xmm7 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm6,(%2) \n"
- "movdqu %%xmm4,0x10(%2) \n"
- "movdqu %%xmm7,0x20(%2) \n"
- "movdqu %%xmm1,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_SOBELXYROW_SSE2
-
-#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
-// Creates a table of cumulative sums where each value is a sum of all values
-// above and to the left of the value, inclusive of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
- int32_t* cumsum,
- const int32_t* previous_cumsum,
- int width) {
- asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "test $0xf,%1 \n"
- "jne 49f \n"
-
- // 4 pixel loop.
- LABELALIGN
- "40: \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "punpckhwd %%xmm1,%%xmm3 \n"
- "punpckhbw %%xmm1,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "punpcklwd %%xmm1,%%xmm4 \n"
- "punpckhwd %%xmm1,%%xmm5 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "movdqu 0x10(%2),%%xmm3 \n"
- "paddd %%xmm0,%%xmm3 \n"
- "paddd %%xmm4,%%xmm0 \n"
- "movdqu 0x20(%2),%%xmm4 \n"
- "paddd %%xmm0,%%xmm4 \n"
- "paddd %%xmm5,%%xmm0 \n"
- "movdqu 0x30(%2),%%xmm5 \n"
- "lea 0x40(%2),%2 \n"
- "paddd %%xmm0,%%xmm5 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "movdqu %%xmm4,0x20(%1) \n"
- "movdqu %%xmm5,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
-
- // 1 pixel loop.
- LABELALIGN
- "10: \n"
- "movd (%0),%%xmm2 \n"
- "lea 0x4(%0),%0 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "lea 0x10(%2),%2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "movdqu %%xmm2,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
-
- "19: \n"
- : "+r"(row), // %0
- "+r"(cumsum), // %1
- "+r"(previous_cumsum), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
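A scalar sketch of the row pass above: each pixel contributes four 32-bit channel sums, and the running sum of the current row is added to the row above (previous_cumsum) to build the integral image incrementally:

static void ComputeCumulativeSumRow_Sketch(const uint8_t* row,
                                           int32_t* cumsum,
                                           const int32_t* previous_cumsum,
                                           int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[i * 4 + c];
      cumsum[i * 4 + c] = sum[c] + previous_cumsum[i * 4 + c];
    }
  }
}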
-
-#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
- const int32_t* botleft,
- int width,
- int area,
- uint8_t* dst,
- int count) {
- asm volatile(
- "movd %5,%%xmm5 \n"
- "cvtdq2ps %%xmm5,%%xmm5 \n"
- "rcpss %%xmm5,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "cmpl $0x80,%5 \n"
- "ja 40f \n"
-
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrld $0x10,%%xmm6 \n"
- "cvtdq2ps %%xmm6,%%xmm6 \n"
- "addps %%xmm6,%%xmm5 \n"
- "mulps %%xmm4,%%xmm5 \n"
- "cvtps2dq %%xmm5,%%xmm5 \n"
- "packssdw %%xmm5,%%xmm5 \n"
-
- // 4 pixel small loop.
- LABELALIGN
- "4: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 4b \n"
- "jmp 49f \n"
-
- // 4 pixel loop
- LABELALIGN
- "40: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm1,%%xmm1 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "mulps %%xmm4,%%xmm1 \n"
- "cvtdq2ps %%xmm2,%%xmm2 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "cvtps2dq %%xmm1,%%xmm1 \n"
- "cvtps2dq %%xmm2,%%xmm2 \n"
- "cvtps2dq %%xmm3,%%xmm3 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
-
- // 1 pixel loop
- LABELALIGN
- "10: \n"
- "movdqu (%0),%%xmm0 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
- "19: \n"
- : "+r"(topleft), // %0
- "+r"(botleft), // %1
- "+r"(dst), // %2
- "+rm"(count) // %3
- : "r"((intptr_t)(width)), // %4
- "rm"(area) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
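A scalar sketch of the box-filter average above: the four integral-image corners give the rectangle sum (tl - tr - bl + br), which the SIMD code scales by a reciprocal of the area rather than dividing. Note that, per the addressing above ((%0,%4,4)), the width argument indexes 32-bit lanes, not pixels:

static void CumulativeSumToAverageRow_Sketch(const int32_t* topleft,
                                             const int32_t* botleft,
                                             int width,  // in int32 lanes
                                             int area,
                                             uint8_t* dst,
                                             int count) {
  for (int i = 0; i < count * 4; ++i) {
    int32_t sum = topleft[i] - topleft[i + width] - botleft[i] +
                  botleft[i + width];
    int32_t v = sum / area;
    dst[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}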
-
-#ifdef HAS_ARGBAFFINEROW_SSE2
-// Copy a row of ARGB pixels from a source image, stepping through the
-// source by the (du, dv) slope given in src_dudv.
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8_t* src_argb,
- int src_argb_stride,
- uint8_t* dst_argb,
- const float* src_dudv,
- int width) {
- intptr_t src_argb_stride_temp = src_argb_stride;
- intptr_t temp;
- asm volatile(
- "movq (%3),%%xmm2 \n"
- "movq 0x08(%3),%%xmm7 \n"
- "shl $0x10,%1 \n"
- "add $0x4,%1 \n"
- "movd %1,%%xmm5 \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
-
- "pshufd $0x44,%%xmm7,%%xmm7 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "addps %%xmm7,%%xmm0 \n"
- "movlhps %%xmm0,%%xmm2 \n"
- "movdqa %%xmm7,%%xmm4 \n"
- "addps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "addps %%xmm4,%%xmm3 \n"
- "addps %%xmm4,%%xmm4 \n"
-
- // 4 pixel loop
- LABELALIGN
- "40: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
- "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
- "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
- "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm1 \n"
- "addps %%xmm4,%%xmm2 \n"
- "movq %%xmm1,(%2) \n"
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm0 \n"
- "addps %%xmm4,%%xmm3 \n"
- "movq %%xmm0,0x08(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
-
- "49: \n"
- "add $0x3,%4 \n"
- "jl 19f \n"
-
- // 1 pixel loop
- LABELALIGN
- "10: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "pmaddwd %%xmm5,%%xmm0 \n"
- "addps %%xmm7,%%xmm2 \n"
- "movd %%xmm0,%k1 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x04(%2),%2 \n"
- "sub $0x1,%4 \n"
- "jge 10b \n"
- "19: \n"
- : "+r"(src_argb), // %0
- "+r"(src_argb_stride_temp), // %1
- "+r"(dst_argb), // %2
- "+r"(src_dudv), // %3
- "+rm"(width), // %4
- "=&r"(temp) // %5
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBAFFINEROW_SSE2
-
-#ifdef HAS_INTERPOLATEROW_SSSE3
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSSE3(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- asm volatile(
- "sub %1,%0 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
-
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm4,%%xmm0 \n"
- "psubb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm5,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm3 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "pmaddubsw %%xmm1,%%xmm3 \n"
- "paddw %%xmm4,%%xmm2 \n"
- "paddw %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm2,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm1 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+rm"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_INTERPOLATEROW_SSSE3
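A scalar sketch of the general blend path above: a fixed-point lerp between the two source rows with an 8-bit fraction. The 0x80808080 bias trick lets pmaddubsw operate on signed data; the rounding bias is ignored in this sketch:

static void InterpolateRow_Sketch(uint8_t* dst_ptr,
                                  const uint8_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  int width,
                                  int source_y_fraction) {
  int f1 = source_y_fraction;  // 0..255 weight of the second row
  int f0 = 256 - f1;           // weight of the first row
  for (int i = 0; i < width; ++i) {
    dst_ptr[i] =
        (uint8_t)((src_ptr[i] * f0 + src_ptr[i + src_stride] * f1) >> 8);
  }
}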
-
-#ifdef HAS_INTERPOLATEROW_AVX2
-// Bilinear filter 32x2 -> 32x1
-void InterpolateRow_AVX2(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- asm volatile(
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "sub %1,%0 \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
-
- "vmovd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "vmovd %3,%%xmm5 \n"
- "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
- "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
- "vbroadcastss %%xmm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm4 \n"
- "vbroadcastss %%xmm4,%%ymm4 \n"
-
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
- "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
- "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "rep movsb \n"
- "jmp 999f \n"
-
- "99: \n"
- "vzeroupper \n"
- "999: \n"
- : "+D"(dst_ptr), // %0
- "+S"(src_ptr), // %1
- "+cm"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
-}
-#endif // HAS_INTERPOLATEROW_AVX2
-
-#ifdef HAS_ARGBSHUFFLEROW_SSSE3
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- asm volatile(
-
- "movdqu (%3),%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(shuffler) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-#endif // HAS_ARGBSHUFFLEROW_SSSE3
-
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- asm volatile(
-
- "vbroadcastf128 (%3),%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(shuffler) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-#endif // HAS_ARGBSHUFFLEROW_AVX2
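A scalar sketch of the shuffles above: shuffler is a 16-byte pshufb control, and the masks used with these functions repeat the same relative 4-byte pattern per pixel, so each pixel's bytes are simply permuted:

static void ARGBShuffleRow_Sketch(const uint8_t* src_argb,
                                  uint8_t* dst_argb,
                                  const uint8_t* shuffler,
                                  int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = src_argb[shuffler[0]];  // e.g. {3,2,1,0,...} swaps order
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}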
-
-#ifdef HAS_I422TOYUY2ROW_SSE2
-void I422ToYUY2Row_SSE2(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_yuy2,
- int width) {
- asm volatile(
-
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "add $0x10,%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%3) \n"
- "movdqu %%xmm1,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_yuy2), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_I422TOYUY2ROW_SSE2
-
-#ifdef HAS_I422TOUYVYROW_SSE2
-void I422ToUYVYRow_SSE2(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uyvy,
- int width) {
- asm volatile(
-
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "add $0x10,%0 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,(%3) \n"
- "movdqu %%xmm2,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_uyvy), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_I422TOUYVYROW_SSE2
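A scalar sketch of the two packings above. Both emit one macropixel (two Y samples sharing one U and one V) per four output bytes; only the byte order differs. Width is assumed even here:

static void I422ToYUY2Row_Sketch(const uint8_t* y,
                                 const uint8_t* u,
                                 const uint8_t* v,
                                 uint8_t* dst,
                                 int width) {
  for (int i = 0; i < width; i += 2) {
    dst[0] = y[0];  // YUY2 layout: Y0 U Y1 V
    dst[1] = u[0];
    dst[2] = y[1];
    dst[3] = v[0];
    y += 2;
    u += 1;
    v += 1;
    dst += 4;
  }
}
// The UYVY variant writes U Y0 V Y1 with the same loop structure.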
-
-#ifdef HAS_I422TOYUY2ROW_AVX2
-void I422ToYUY2Row_AVX2(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_yuy2,
- int width) {
- asm volatile(
-
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
- "vextractf128 $0x0,%%ymm1,(%3) \n"
- "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
- "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
- "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_yuy2), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_I422TOYUY2ROW_AVX2
-
-#ifdef HAS_I422TOUYVYROW_AVX2
-void I422ToUYVYRow_AVX2(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uyvy,
- int width) {
- asm volatile(
-
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
- "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
- "vextractf128 $0x0,%%ymm1,(%3) \n"
- "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
- "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
- "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_uyvy), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif // HAS_I422TOUYVYROW_AVX2
-
-#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const float* poly,
- int width) {
- asm volatile(
-
- "pxor %%xmm3,%%xmm3 \n"
-
- // 2 pixel loop.
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm3,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "mulps 0x10(%3),%%xmm0 \n"
- "mulps 0x10(%3),%%xmm4 \n"
- "addps (%3),%%xmm0 \n"
- "addps (%3),%%xmm4 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm6 \n"
- "mulps %%xmm1,%%xmm2 \n"
- "mulps %%xmm5,%%xmm6 \n"
- "mulps %%xmm2,%%xmm1 \n"
- "mulps %%xmm6,%%xmm5 \n"
- "mulps 0x20(%3),%%xmm2 \n"
- "mulps 0x20(%3),%%xmm6 \n"
- "mulps 0x30(%3),%%xmm1 \n"
- "mulps 0x30(%3),%%xmm5 \n"
- "addps %%xmm2,%%xmm0 \n"
- "addps %%xmm6,%%xmm4 \n"
- "addps %%xmm1,%%xmm0 \n"
- "addps %%xmm5,%%xmm4 \n"
- "cvttps2dq %%xmm0,%%xmm0 \n"
- "cvttps2dq %%xmm4,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x2,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(poly) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif // HAS_ARGBPOLYNOMIALROW_SSE2
-
-#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const float* poly,
- int width) {
- asm volatile(
- "vbroadcastf128 (%3),%%ymm4 \n"
- "vbroadcastf128 0x10(%3),%%ymm5 \n"
- "vbroadcastf128 0x20(%3),%%ymm6 \n"
- "vbroadcastf128 0x30(%3),%%ymm7 \n"
-
- // 2 pixel loop.
- LABELALIGN
- "1: \n"
- "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels
- "lea 0x8(%0),%0 \n"
- "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
- "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
- "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
- "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
- "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
- "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X *
- // X
- "vcvttps2dq %%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
- "vmovq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x2,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(poly) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-#endif // HAS_ARGBPOLYNOMIALROW_AVX2
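A scalar sketch of the polynomial evaluation: poly holds four coefficients per channel (C0..C3 as four groups of four floats, matching the 0x00..0x30 loads above), and each channel is evaluated as a cubic then clamped:

static void ARGBPolynomialRow_Sketch(const uint8_t* src_argb,
                                     uint8_t* dst_argb,
                                     const float* poly,
                                     int width) {
  for (int i = 0; i < width * 4; ++i) {
    int c = i & 3;  // channel within the pixel
    float x = (float)src_argb[i];
    float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
              poly[c + 12] * x * x * x;
    dst_argb[i] = (uint8_t)(v < 0.f ? 0 : (v > 255.f ? 255 : v));
  }
}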
-
-#ifdef HAS_HALFFLOATROW_SSE2
-static float kScaleBias = 1.9259299444e-34f;
-void HalfFloatRow_SSE2(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- scale *= kScaleBias;
- asm volatile(
- "movd %3,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "sub %0,%1 \n"
-
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm2 \n" // 8 shorts
- "add $0x10,%0 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
- "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
- "punpckhwd %%xmm5,%%xmm3 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "psrld $0xd,%%xmm2 \n"
- "psrld $0xd,%%xmm3 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "movdqu %%xmm2,-0x10(%0,%1,1) \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(scale) // %3
- : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_HALFFLOATROW_SSE2
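The kScaleBias constant is 2^-112: multiplying a float by it subtracts 112 from the exponent, which is exactly the float-to-half exponent rebias (127 - 15), so the half-float bit pattern can then be read off with a 13-bit shift (the psrld $0xd above). A scalar sketch of the same trick, truncating and with no NaN/overflow handling:

static uint16_t HalfFloat_Sketch(uint16_t v, float scale) {
  float f = (float)v * scale * 1.9259299444e-34f;  // * 2^-112 rebias
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // relies on <string.h>
  return (uint16_t)(bits >> 13);    // drop 13 mantissa bits -> half layout
}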
-
-#ifdef HAS_HALFFLOATROW_AVX2
-void HalfFloatRow_AVX2(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- scale *= kScaleBias;
- asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
-
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm2 \n" // 16 shorts
- "add $0x20,%0 \n"
- "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
- "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
- "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
- "vpsrld $0xd,%%ymm3,%%ymm3 \n"
- "vpsrld $0xd,%%ymm2,%%ymm2 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
- "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
-
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
-#if defined(__x86_64__)
- : "x"(scale) // %3
-#else
- : "m"(scale) // %3
-#endif
- : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_HALFFLOATROW_AVX2
-
-#ifdef HAS_HALFFLOATROW_F16C
-void HalfFloatRow_F16C(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "sub %0,%1 \n"
-
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
- "vpmovzxwd 0x10(%0),%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
- "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
- "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
- "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
- "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
- "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
- "add $0x20,%0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
-#if defined(__x86_64__)
- : "x"(scale) // %3
-#else
- : "m"(scale) // %3
-#endif
- : "memory", "cc", "xmm2", "xmm3", "xmm4");
-}
-#endif // HAS_HALFFLOATROW_F16C
-
-#ifdef HAS_HALFFLOATROW_F16C
-void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
- asm volatile(
- "sub %0,%1 \n"
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
- "vpmovzxwd 0x10(%0),%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
- "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
- "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
- "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
- "add $0x20,%0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "xmm2", "xmm3");
-}
-#endif // HAS_HALFFLOATROW_F16C
-
-#ifdef HAS_ARGBCOLORTABLEROW_X86
-// Transform ARGB pixels with color table.
-void ARGBColorTableRow_X86(uint8_t* dst_argb,
- const uint8_t* table_argb,
- int width) {
- uintptr_t pixel_temp;
- asm volatile(
- // 1 pixel loop.
- LABELALIGN
- "1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "movzb -0x1(%0),%1 \n"
- "movzb 0x03(%3,%1,4),%1 \n"
- "mov %b1,-0x1(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "=&d"(pixel_temp), // %1
- "+r"(width) // %2
- : "r"(table_argb) // %3
- : "memory", "cc");
-}
-#endif // HAS_ARGBCOLORTABLEROW_X86
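A scalar sketch of the lookup above: table_argb is 256 entries of 4 bytes, each channel indexes its own column, and the pixels are rewritten in place:

static void ARGBColorTableRow_Sketch(uint8_t* dst_argb,
                                     const uint8_t* table_argb,
                                     int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}
// The RGB variant below is identical except that it leaves alpha alone.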
-
-#ifdef HAS_RGBCOLORTABLEROW_X86
-// Transform RGB pixels with color table.
-void RGBColorTableRow_X86(uint8_t* dst_argb,
- const uint8_t* table_argb,
- int width) {
- uintptr_t pixel_temp;
- asm volatile(
- // 1 pixel loop.
- LABELALIGN
- "1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "=&d"(pixel_temp), // %1
- "+r"(width) // %2
- : "r"(table_argb) // %3
- : "memory", "cc");
-}
-#endif // HAS_RGBCOLORTABLEROW_X86
-
-#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
-// Transform RGB pixels with luma table.
-void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- const uint8_t* luma,
- uint32_t lumacoeff) {
- uintptr_t pixel_temp;
- uintptr_t table_temp;
- asm volatile(
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0x8,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%2),%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "phaddw %%xmm0,%%xmm0 \n"
- "pand %%xmm4,%%xmm0 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb (%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,(%3) \n"
- "movzb 0x1(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x1(%3) \n"
- "movzb 0x2(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x2(%3) \n"
- "movzb 0x3(%2),%0 \n"
- "mov %b0,0x3(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb 0x4(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x4(%3) \n"
- "movzb 0x5(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x5(%3) \n"
- "movzb 0x6(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x6(%3) \n"
- "movzb 0x7(%2),%0 \n"
- "mov %b0,0x7(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb 0x8(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x8(%3) \n"
- "movzb 0x9(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x9(%3) \n"
- "movzb 0xa(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xa(%3) \n"
- "movzb 0xb(%2),%0 \n"
- "mov %b0,0xb(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
-
- "movzb 0xc(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xc(%3) \n"
- "movzb 0xd(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xd(%3) \n"
- "movzb 0xe(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xe(%3) \n"
- "movzb 0xf(%2),%0 \n"
- "mov %b0,0xf(%3) \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x10(%3),%3 \n"
- "sub $0x4,%4 \n"
- "jg 1b \n"
- : "=&d"(pixel_temp), // %0
- "=&a"(table_temp), // %1
- "+r"(src_argb), // %2
- "+r"(dst_argb), // %3
- "+rm"(width) // %4
- : "r"(luma), // %5
- "rm"(lumacoeff) // %6
- : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
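A scalar sketch of the luma-adaptive lookup above: a per-pixel weighted luma, masked to its high byte (the pand with 0xff00 words), selects one 256-byte page of the luma table, and B/G/R are each mapped through that page while alpha is copied. Here lumacoeff is treated as four byte weights, the fourth typically zero:

static void ARGBLumaColorTableRow_Sketch(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width,
                                         const uint8_t* luma,
                                         const uint8_t coeff[4]) {
  for (int i = 0; i < width; ++i) {
    uint32_t l = src_argb[0] * coeff[0] + src_argb[1] * coeff[1] +
                 src_argb[2] * coeff[2];
    const uint8_t* page = luma + (l & 0xff00);  // one 256-byte table page
    dst_argb[0] = page[src_argb[0]];
    dst_argb[1] = page[src_argb[1]];
    dst_argb[2] = page[src_argb[2]];
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}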
-
-#ifdef HAS_NV21TOYUV24ROW_AVX2
-
-// Blend and shuffle constants for NV21ToYUV24Row_AVX2.
-static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
- 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
- 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
- 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
-
-static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
- 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
- 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
- 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
-
-static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
- 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
- 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
- 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
-
-static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
- 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
- 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
- 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
-
-static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
- 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
- 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
- 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
-
-static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
- 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
- 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
- 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
-
-static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
- 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
- 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
- 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
-
-static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
- 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
- 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
- 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
-
-static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
- 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
- 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
- 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
-
-// NV21ToYUV24Row_AVX2
-void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_yuv24,
- int width) {
- uint8_t* src_y_ptr;
- uint64_t src_offset = 0;
- uint64_t width64;
-
- width64 = width;
- src_y_ptr = (uint8_t*)src_y;
-
- asm volatile(
- "vmovdqu %5, %%ymm0 \n" // init blend value
- "vmovdqu %6, %%ymm1 \n" // init blend value
- "vmovdqu %7, %%ymm2 \n" // init blend value
- // "sub $0x20, %3 \n" //sub 32 from width for final loop
-
- LABELALIGN
- "1: \n" // label 1
- "vmovdqu (%0,%4), %%ymm3 \n" // src_y
- "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
- "vmovdqu (%1), %%ymm5 \n" // src_uv
- "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
- "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
- // shuf
- "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
- // shuf
- "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
- "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
- // shuf
- "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
- "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
- "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
- "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
- "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
- "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
- "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
- "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
- "add $0x20, %4 \n" // add to src buffer
- // ptr
- "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
- "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
- "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
- "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
- "add $0x60,%2 \n" // add to dst buffer
- // ptr
- // "cmp %3, %4 \n" //(width64 -
- // 32 bytes) and src_offset
- "sub $0x20,%3 \n" // 32 pixels per loop
- "jg 1b \n"
- "vzeroupper \n" // sse-avx2
- // transistions
-
- : "+r"(src_y), //%0
- "+r"(src_vu), //%1
- "+r"(dst_yuv24), //%2
- "+r"(width64), //%3
- "+r"(src_offset) //%4
- : "m"(kBLEND0), //%5
- "m"(kBLEND1), //%6
- "m"(kBLEND2), //%7
- "m"(kSHUF0), //%8
- "m"(kSHUF1), //%9
- "m"(kSHUF2), //%10
- "m"(kSHUF3), //%11
- "m"(kSHUF4), //%12
- "m"(kSHUF5) //%13
- : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
- "xmm13", "xmm14", "xmm15");
-}
-#endif // HAS_NV21TOYUV24ROW_AVX2
-
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_mmi.cc b/files/source/row_mmi.cc
deleted file mode 100644
index d8726d09..00000000
--- a/files/source/row_mmi.cc
+++ /dev/null
@@ -1,6042 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include "libyuv/row.h"
-
-#include <string.h> // For memcpy and memset.
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for MIPS MMI.
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src1, dest;
- const uint64_t mask = 0xff000000ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
-
- "or %[src0], %[src0], %[mask] \n\t"
- "or %[src1], %[src1], %[mask] \n\t"
- "punpcklwd %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
-
- "or %[src0], %[src0], %[mask] \n\t"
- "or %[src1], %[src1], %[mask] \n\t"
- "punpcklwd %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width),
- [mask] "f"(mask)
- : "memory");
-}
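A scalar sketch of the conversion above: three source bytes are copied and an opaque alpha is OR'd in (the 0xff000000 mask). Illustrative only:

static void RGB24ToARGBRow_Sketch(const uint8_t* src, uint8_t* dst, int n) {
  for (int i = 0; i < n; ++i) {
    dst[0] = src[0];  // B
    dst[1] = src[1];  // G
    dst[2] = src[2];  // R
    dst[3] = 255;     // opaque alpha
    src += 3;
    dst += 4;
  }
}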
-
-void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
- uint64_t src0, src1, dest;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0xff000000ULL;
- const uint64_t mask2 = 0xc6;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
-
- "or %[src0], %[src0], %[mask1] \n\t"
- "punpcklbh %[src0], %[src0], %[mask0] \n\t"
- "pshufh %[src0], %[src0], %[mask2] \n\t"
- "or %[src1], %[src1], %[mask1] \n\t"
- "punpcklbh %[src1], %[src1], %[mask0] \n\t"
- "pshufh %[src1], %[src1], %[mask2] \n\t"
- "packushb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
-
- "or %[src0], %[src0], %[mask1] \n\t"
- "punpcklbh %[src0], %[src0], %[mask0] \n\t"
- "pshufh %[src0], %[src0], %[mask2] \n\t"
- "or %[src1], %[src1], %[mask1] \n\t"
- "punpcklbh %[src1], %[src1], %[mask0] \n\t"
- "pshufh %[src1], %[src1], %[mask2] \n\t"
- "packushb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width)
- : "memory");
-}
-
-void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
- uint64_t src0, src1;
- uint64_t ftmp[4];
- uint64_t mask0 = 0xc6;
- uint64_t mask1 = 0x6c;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t"
- "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t"
- "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t"
-
- "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
- "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
- "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
- "punpcklbh %[src1], %[src1], %[zero] \n\t"
- "pextrh %[ftmp2], %[ftmp0], %[three] \n\t"
- "pextrh %[ftmp3], %[ftmp1], %[one] \n\t"
- "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
- "pextrh %[ftmp3], %[ftmp1], %[two] \n\t"
- "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
- "pshufh %[src1], %[src1], %[mask1] \n\t"
- "pextrh %[ftmp2], %[src1], %[zero] \n\t"
- "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
- "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t"
- "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
- "packushb %[src1], %[src1], %[zero] \n\t"
-
- "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t"
- "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t"
- "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t"
- "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t"
-
- "daddiu %[src_raw], %[src_raw], 0x0c \n\t"
- "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
- [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3])
- : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width),
- [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
- [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03)
- : "memory");
-}
-
-void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
- uint8_t* dst_argb,
- int width) {
- uint64_t ftmp[5];
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0007000700070007;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psrlh %[r], %[src1], %[three] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[two] \n\t"
- "psrlh %[src1], %[g], %[four] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "packushb %[b], %[b], %[r] \n\t"
- "packushb %[g], %[g], %[c1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
- "punpckhhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
- "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t"
- "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
- [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4])
- : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
- [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02),
- [four] "f"(0x04)
- : "memory");
-}
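A scalar sketch of the 565 expansion above: each field is widened to 8 bits by replicating its top bits, (x << 3) | (x >> 2) for the 5-bit blue and red, (x << 2) | (x >> 4) for the 6-bit green, matching the psllh/psrlh/or sequences:

static void RGB565ToARGBRow_Sketch(const uint8_t* src, uint8_t* dst, int n) {
  for (int i = 0; i < n; ++i) {
    uint16_t p = (uint16_t)(src[0] | (src[1] << 8));
    uint8_t b = p & 0x1f;
    uint8_t g = (p >> 5) & 0x3f;
    uint8_t r = (p >> 11) & 0x1f;
    dst[0] = (uint8_t)((b << 3) | (b >> 2));
    dst[1] = (uint8_t)((g << 2) | (g >> 4));
    dst[2] = (uint8_t)((r << 3) | (r >> 2));
    dst[3] = 255;
    src += 2;
    dst += 4;
  }
}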
-
-void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
- uint8_t* dst_argb,
- int width) {
- uint64_t ftmp[6];
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0003000300030003;
- uint64_t c3 = 0x007c007c007c007c;
- uint64_t c4 = 0x0001000100010001;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "and %[r], %[src1], %[c3] \n\t"
- "psrlh %[r], %[r], %[two] \n\t"
- "psrlh %[a], %[src1], %[seven] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[three] \n\t"
- "psrlh %[src1], %[g], %[two] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "xor %[a], %[a], %[c1] \n\t"
- "paddb %[a], %[a], %[c4] \n\t"
- "packushb %[b], %[b], %[r] \n\t"
- "packushb %[g], %[g], %[a] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
- "punpckhhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
- "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t"
- "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
- [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
- : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
- [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05),
- [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
- : "memory");
-}
-
-void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
- uint8_t* dst_argb,
- int width) {
- uint64_t ftmp[6];
- uint64_t c0 = 0x000f000f000f000f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g], %[src0], %[four] \n\t"
- "and %[r], %[src1], %[c0] \n\t"
- "psrlh %[a], %[src1], %[four] \n\t"
- "psllh %[src0], %[b], %[four] \n\t"
- "or %[b], %[src0], %[b] \n\t"
- "psllh %[src0], %[g], %[four] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psllh %[src0], %[r], %[four] \n\t"
- "or %[r], %[src0], %[r] \n\t"
- "psllh %[src0], %[a], %[four] \n\t"
- "or %[a], %[src0], %[a] \n\t"
- "packushb %[b], %[b], %[r] \n\t"
- "packushb %[g], %[g], %[a] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
- "punpckhhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
- "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t"
- "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
- [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
- : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08),
- [four] "f"(0x04)
- : "memory");
-}
-
-void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
- uint64_t src;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t"
- "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t"
- "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t"
- "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t"
- "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t"
- "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t"
- "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t"
- "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width)
- : "memory");
-}
-
-void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
- uint64_t src0, src1;
- uint64_t ftmp[3];
- uint64_t mask0 = 0xc6;
- uint64_t mask1 = 0x18;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
- "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
- "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
- "punpcklbh %[ftmp2], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
-
- "pextrh %[src0], %[ftmp1], %[two] \n\t"
- "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t"
- "pshufh %[ftmp1], %[ftmp1], %[one] \n\t"
-
- "pextrh %[src0], %[ftmp2], %[two] \n\t"
- "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t"
- "pextrh %[src0], %[ftmp2], %[one] \n\t"
- "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t"
- "pextrh %[src0], %[ftmp2], %[zero] \n\t"
- "pshufh %[src1], %[src1], %[mask1] \n\t"
- "pinsrh_0 %[src1], %[src1], %[src0] \n\t"
- "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
- "packushb %[src1], %[src1], %[zero] \n\t"
-
- "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t"
- "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t"
- "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
- [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
- [one] "f"(0x01), [two] "f"(0x02)
- : "memory");
-}
-
-void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
- uint64_t src0, src1;
- uint64_t ftmp[3];
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[b], %[src0], %[src1] \n\t"
- "punpckhbh %[g], %[src0], %[src1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklbh %[b], %[src0], %[zero] \n\t"
- "punpckhbh %[g], %[src0], %[zero] \n\t"
- "punpcklbh %[r], %[src1], %[zero] \n\t"
-
- "psrlh %[b], %[b], %[three] \n\t"
- "psrlh %[g], %[g], %[two] \n\t"
- "psrlh %[r], %[r], %[three] \n\t"
-
- "psllh %[g], %[g], %[five] \n\t"
- "psllh %[r], %[r], %[eleven] \n\t"
- "or %[b], %[b], %[g] \n\t"
- "or %[b], %[b], %[r] \n\t"
-
- "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
- [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05),
- [eleven] "f"(0x0b)
- : "memory");
-}
-
-// dither4 is a row of 4 values from a 4x4 dither matrix.
-// The 4x4 matrix contains values to add to RGB. When converting to
-// fewer bits (565) this provides an ordered dither.
-// The first byte of the 4x4 matrix corresponds to the upper-left pixel.
-// The 4 values are passed as an int, then referenced as an array, so
-// endianness does not affect the order of the original matrix. But dither4
-// holds the first pixel in the lower byte on little-endian targets and in
-// the upper byte on big-endian targets. (A scalar sketch follows this
-// function.)
-void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width) {
- uint64_t src0, src1;
- uint64_t ftmp[3];
- uint64_t c0 = 0x00ff00ff00ff00ff;
-
- __asm__ volatile(
- "punpcklbh %[dither], %[dither], %[zero] \n\t"
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[b], %[src0], %[src1] \n\t"
- "punpckhbh %[g], %[src0], %[src1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklbh %[b], %[src0], %[zero] \n\t"
- "punpckhbh %[g], %[src0], %[zero] \n\t"
- "punpcklbh %[r], %[src1], %[zero] \n\t"
-
- "paddh %[b], %[b], %[dither] \n\t"
- "paddh %[g], %[g], %[dither] \n\t"
- "paddh %[r], %[r], %[dither] \n\t"
- "pcmpgth %[src0], %[b], %[c0] \n\t"
- "or %[src0], %[src0], %[b] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "pcmpgth %[src0], %[g], %[c0] \n\t"
- "or %[src0], %[src0], %[g] \n\t"
- "and %[g], %[src0], %[c0] \n\t"
- "pcmpgth %[src0], %[r], %[c0] \n\t"
- "or %[src0], %[src0], %[r] \n\t"
- "and %[r], %[src0], %[c0] \n\t"
-
- "psrlh %[b], %[b], %[three] \n\t"
- "psrlh %[g], %[g], %[two] \n\t"
- "psrlh %[r], %[r], %[three] \n\t"
-
- "psllh %[g], %[g], %[five] \n\t"
- "psllh %[r], %[r], %[eleven] \n\t"
- "or %[b], %[b], %[g] \n\t"
- "or %[b], %[b], %[r] \n\t"
-
- "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
- [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02),
- [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b)
- : "memory");
-}
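To make the comment above concrete, here is a minimal scalar model of what each iteration does per pixel: add the dither byte, clamp to 255 (the pcmpgth/or/and sequence in the MMI code), then pack to 565. This is an editor's sketch with a hypothetical name, not code from the deleted file:

#include <stdint.h>

/* Hypothetical scalar model of ARGBToRGB565DitherRow_MMI.
 * dither4 holds one row of the 4x4 matrix, first pixel in the
 * low byte on little-endian targets. */
static void ARGBToRGB565DitherRowC(const uint8_t* argb, uint8_t* rgb565,
                                   uint32_t dither4, int width) {
  for (int x = 0; x < width; ++x) {
    int d = (dither4 >> ((x & 3) * 8)) & 0xff;
    int b = argb[0] + d;
    int g = argb[1] + d;
    int r = argb[2] + d;
    if (b > 255) b = 255; /* saturate, as pcmpgth/or/and does above */
    if (g > 255) g = 255;
    if (r > 255) r = 255;
    uint16_t p = (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    rgb565[0] = (uint8_t)p;
    rgb565[1] = (uint8_t)(p >> 8);
    argb += 4;
    rgb565 += 2;
  }
}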
-
-void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- uint64_t src0, src1;
- uint64_t ftmp[4];
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[b], %[src0], %[src1] \n\t"
- "punpckhbh %[g], %[src0], %[src1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklbh %[b], %[src0], %[zero] \n\t"
- "punpckhbh %[g], %[src0], %[zero] \n\t"
- "punpcklbh %[r], %[src1], %[zero] \n\t"
- "punpckhbh %[a], %[src1], %[zero] \n\t"
-
- "psrlh %[b], %[b], %[three] \n\t"
- "psrlh %[g], %[g], %[three] \n\t"
- "psrlh %[r], %[r], %[three] \n\t"
- "psrlh %[a], %[a], %[seven] \n\t"
-
- "psllh %[g], %[g], %[five] \n\t"
- "psllh %[r], %[r], %[ten] \n\t"
- "psllh %[a], %[a], %[fifteen] \n\t"
- "or %[b], %[b], %[g] \n\t"
- "or %[b], %[b], %[r] \n\t"
- "or %[b], %[b], %[a] \n\t"
-
- "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
- [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05),
- [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f)
- : "memory");
-}
-
-void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- uint64_t src0, src1;
- uint64_t ftmp[4];
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[b], %[src0], %[src1] \n\t"
- "punpckhbh %[g], %[src0], %[src1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklbh %[b], %[src0], %[zero] \n\t"
- "punpckhbh %[g], %[src0], %[zero] \n\t"
- "punpcklbh %[r], %[src1], %[zero] \n\t"
- "punpckhbh %[a], %[src1], %[zero] \n\t"
-
- "psrlh %[b], %[b], %[four] \n\t"
- "psrlh %[g], %[g], %[four] \n\t"
- "psrlh %[r], %[r], %[four] \n\t"
- "psrlh %[a], %[a], %[four] \n\t"
-
- "psllh %[g], %[g], %[four] \n\t"
- "psllh %[r], %[r], %[eight] \n\t"
- "psllh %[a], %[a], %[twelve] \n\t"
- "or %[b], %[b], %[g] \n\t"
- "or %[b], %[b], %[r] \n\t"
- "or %[b], %[b], %[a] \n\t"
-
- "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
- [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08),
- [twelve] "f"(0x0c)
- : "memory");
-}
-
-void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0001004200810019;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
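The constant mask 0x0001004200810019 packs the fixed-point BT.601 luma weights 25 (B), 129 (G) and 66 (R) next to a unit weight for the 0x1080 bias that pinsrh_3 writes into the alpha lane, so each pmaddhw/paddw/psrlw sequence evaluates (66R + 129G + 25B + 0x1080) >> 8 for one pixel, eight pixels per loop. A one-line scalar restatement (editor's sketch; the function name is hypothetical):

#include <stdint.h>

/* Hypothetical scalar equivalent of ARGBToYRow_MMI: fixed-point BT.601
 * luma, with the +16 studio-swing offset folded into the 0x1080 bias. */
static inline uint8_t ARGBToYPixel(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}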
-
-void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
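This routine first sums each 2x2 block across the two rows and shifts right by 2 (a rounded-down per-channel average), then applies the packed weights: mask_u encodes (2, 112, 74, 38) and mask_v encodes (18, 94, 112, 2), with the inserted 0x4040 doubling to the 0x8080 bias under the weight 2 before the psubw/psraw steps. A scalar model of the per-block arithmetic (editor's sketch; the helper name is hypothetical):

#include <stdint.h>

/* Hypothetical scalar model of one 2x2 block in ARGBToUVRow_MMI.
 * b, g, r are the truncated averages of the four source pixels. */
static inline void ARGBToUVBlock(int b, int g, int r,
                                 uint8_t* u, uint8_t* v) {
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}

The BGRA/ABGR/RGBA variants that follow differ only in channel order, which is why their mask constants are byte-reordered permutations of the same weights.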
-
-void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0019008100420001;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
-void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x00020070004a0026;
- const uint64_t mask_v = 0x0012005e00700002;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[dest0_u], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src1], %[src0] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src0], %[src1] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[dest1_u], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src1], %[src0] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src0], %[src1] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[dest2_u], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src1], %[src0] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src0], %[src1] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[dest3_u], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src1], %[src0] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src0], %[src1] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
-void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0001001900810042;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
-void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x00020070004a0026;
- const uint64_t mask_v = 0x0012005e00700002;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
- "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src1], %[src0] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src0], %[src1] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
- "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src1], %[src0] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src0], %[src1] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
- "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src1], %[src0] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src0], %[src1] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
- "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src1], %[src0] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src0], %[src1] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
-void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0042008100190001;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
-void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t"
- "dsrl %[dest0_v], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
- "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t"
- "dsrl %[dest1_v], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
- "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t"
- "dsrl %[dest2_v], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
- "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t"
- "dsrl %[dest3_v], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
- "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
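RGBAToUVRow_MMI above subsamples 2x2: paddh accumulates four pixels (two per row, across src_stride_rgb) and psrlh truncates by 2, then pmaddhw applies mask_u/mask_v with the 0x4040 bias slotted in by pinsrh, and the psubw pairs realize the negative coefficients. A scalar model of one U/V sample (helper names are illustrative, not from the source):

#include <stdint.h>

typedef struct { uint8_t b, g, r; } Pixel;

/* One U/V pair from a 2x2 block. 0x8080 = 2 * 0x4040 (the +128 chroma
   offset plus rounding), matching the pmaddhw coefficient of 2 on the
   inserted bias halfword. */
static void RGBToUV(Pixel p00, Pixel p01, Pixel p10, Pixel p11,
                    uint8_t* u, uint8_t* v) {
  int b = (p00.b + p01.b + p10.b + p11.b) >> 2;  /* truncating average */
  int g = (p00.g + p01.g + p10.g + p11.g) >> 2;
  int r = (p00.r + p01.r + p10.r + p11.r) >> 2;
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}

The loop consumes 16 pixels (0x40 bytes) per iteration and emits 8 U and 8 V bytes, so width here counts source pixels.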
-void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0001004200810019;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
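RGB24ToYRow_MMI above differs from the 32-bit rows only in addressing: each gsld pair loads 8 bytes at a 6-byte stride, covering two 3-byte pixels, and the dsll by 8 re-aligns the second pixel so punpckhbh unpacks it cleanly (pinsrh_3 then overwrites the stray fourth halfword with the bias). A small model of that layout (function name illustrative):

#include <stdint.h>
#include <string.h>

/* Two packed RGB24 (B,G,R in memory) pixels per 8-byte load at p:
   bytes 0-2 are pixel 0 (reached via punpcklbh), bytes 3-5 are pixel 1
   (reached via dsll 8 + punpckhbh); bytes 6-7 are ignored. */
static void LoadTwoRGB24(const uint8_t* p, uint8_t px0[3], uint8_t px1[3]) {
  memcpy(px0, p, 3);
  memcpy(px1, p + 3, 3);
}

The source pointer therefore advances 0x18 = 24 bytes per eight output Y bytes.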
-void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
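Putting the 24-bit addressing and the chroma arithmetic together, the whole of RGB24ToUVRow_MMI above reduces to this scalar model (a sketch assuming width is a positive multiple of 16, as the bgtz loop requires; the function name is illustrative):

#include <stdint.h>

static void RGB24ToUVRow_model(const uint8_t* src, int stride,
                               uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    const uint8_t* p0 = src + x * 3;  /* two pixels on this row... */
    const uint8_t* p1 = p0 + stride;  /* ...and two on the next */
    int b = (p0[0] + p0[3] + p1[0] + p1[3]) >> 2;
    int g = (p0[1] + p0[4] + p1[1] + p1[4]) >> 2;
    int r = (p0[2] + p0[5] + p1[2] + p1[5]) >> 2;
    *dst_u++ = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    *dst_v++ = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
  }
}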
-void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0001001900810042;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
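RAWToYRow_MMI above handles the byte-reversed 24-bit format (R,G,B in memory): its mask 0x0001001900810042 is RGB24ToYRow's 0x0001004200810019 with the 0x19 (25) and 0x42 (66) halfwords exchanged, so the same luma lands on the right channels. An illustrative check of that relationship (not source code):

#include <stdint.h>

/* Swapping the B and R coefficient halfwords converts the RGB24 luma
   mask into the RAW one:
   SwapBRCoeffs(0x0001004200810019) == 0x0001001900810042 */
static uint64_t SwapBRCoeffs(uint64_t mask) {
  uint64_t b = mask & 0xffff;
  uint64_t r = (mask >> 32) & 0xffff;
  return (mask & 0xffff0000ffff0000ULL) | r | (b << 32);
}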
-void RAWToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x00020070004a0026;
- const uint64_t mask_v = 0x0012005e00700002;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
- "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src1], %[src0] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src0], %[src1] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
- "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src1], %[src0] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src0], %[src1] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
- "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src1], %[src0] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src0], %[src1] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
- "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src1], %[src0] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src0], %[src1] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
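RAWToUVRow_MMI above likewise reverses the coefficient order inside mask_u/mask_v and swaps the psubw operands, so the signed result matches the RGB24 path. All of these 4x16-bit constants follow one packing rule, shown here as a helper (illustrative, not from the source):

#include <stdint.h>

/* Packs four 16-bit halfword coefficients, lowest lane first, e.g.
   MakeMask(38, 74, 112, 2) == 0x00020070004a0026 (mask_u above). */
static uint64_t MakeMask(uint16_t h0, uint16_t h1, uint16_t h2, uint16_t h3) {
  return (uint64_t)h0 | ((uint64_t)h1 << 16) |
         ((uint64_t)h2 << 32) | ((uint64_t)h3 << 48);
}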
-void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest, dest0, dest1, dest2, dest3;
- uint64_t tmp0, tmp1;
- const uint64_t shift = 0x07;
- const uint64_t value = 0x0040;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x00010026004B000FULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
- "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
- "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
- "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest2], %[dest2], %[shift] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
- "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest3], %[dest3], %[shift] \n\t"
-
- "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
- "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
- "packushb %[dest], %[tmp0], %[tmp1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
- [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0),
- [tmp1] "=&f"(tmp1)
- : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value),
- [width] "r"(width)
- : "memory");
-}
-
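ARGBToYJRow_MMI above is the full-range (JPEG) luma: mask1 packs 15/75/38 for B/G/R, the pinsrh_3 slot carries the 0x40 rounding bias, and the shift is 7 rather than 8. Scalar sketch:

#include <stdint.h>

/* Full-range luma: no +16 offset, 7-bit coefficients, round to nearest. */
static uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((38 * r + 75 * g + 15 * b + 64) >> 7);
}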
-void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x002b0054007f0002;
- const uint64_t mask_v = 0x0002007f006b0014;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
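ARGBToUVJRow_MMI above also goes full range, and it subsamples differently: pavgh averages with rounding (three averages per 2x2 block) instead of the truncating sum-and-shift used by the BT.601 rows. With r, g, b standing for those rounded averages, the chroma reduces to the following sketch (names illustrative):

#include <stdint.h>

static void RGBToUVJ(int r, int g, int b, uint8_t* u, uint8_t* v) {
  *u = (uint8_t)((127 * b - 84 * g - 43 * r + 0x8080) >> 8);
  *v = (uint8_t)((127 * r - 107 * g - 20 * b + 0x8080) >> 8);
}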
-void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
- uint64_t ftmp[11];
- const uint64_t value = 0x1080108010801080;
- const uint64_t mask = 0x0001004200810019;
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0007000700070007;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psrlh %[r], %[src1], %[three] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[two] \n\t"
- "psrlh %[src1], %[g], %[four] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[src0], %[src1] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[src0], %[src1] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psrlh %[r], %[src1], %[three] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[two] \n\t"
- "psrlh %[src1], %[g], %[four] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[src0], %[src1] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[src0], %[src1] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddiu %[width], %[width], -0x08 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
- [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
- [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
- [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
- : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
- [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05),
- [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04)
- : "memory");
-}
-
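RGB565ToYRow_MMI above first widens each channel to 8 bits by replicating its top bits, then reuses the standard 25/129/66 luma mask. The unpacking, extracted from the shift/or sequences above (names illustrative):

#include <stdint.h>

/* 565 -> 888: replicate each channel's high bits into the vacated low
   bits, (x << n) | (x >> (bits - n)), so 0 stays 0 and the field
   maximum maps to 255. */
static void RGB565ToRGB(uint16_t px, uint8_t* r, uint8_t* g, uint8_t* b) {
  uint8_t b5 = px & 0x1f, g6 = (px >> 5) & 0x3f, r5 = (uint8_t)(px >> 11);
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}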
-void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555,
- uint8_t* dst_y,
- int width) {
- uint64_t ftmp[11];
- const uint64_t value = 0x1080108010801080;
- const uint64_t mask = 0x0001004200810019;
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0003000300030003;
- uint64_t c3 = 0x007c007c007c007c;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "and %[r], %[src1], %[c3] \n\t"
- "psrlh %[r], %[r], %[two] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[three] \n\t"
- "psrlh %[src1], %[g], %[two] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[src0], %[src1] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[src0], %[src1] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "and %[r], %[src1], %[c3] \n\t"
- "psrlh %[r], %[r], %[two] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[three] \n\t"
- "psrlh %[src1], %[g], %[two] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[src0], %[src1] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[src0], %[src1] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddiu %[width], %[width], -0x08 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
- [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
- [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
- [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
- : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y),
- [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
- [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08),
- [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
- : "memory");
-}
-
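ARGB1555ToYRow_MMI above is the 5-5-5 variant: c2 and c3 pick the green and red fields out of the high byte, the alpha bit (bit 15) is simply dropped, and every channel expands via (x << 3) | (x >> 2). Equivalent unpacking (names illustrative):

#include <stdint.h>

static void ARGB1555ToRGB(uint16_t px, uint8_t* r, uint8_t* g, uint8_t* b) {
  uint8_t b5 = px & 0x1f, g5 = (px >> 5) & 0x1f, r5 = (px >> 10) & 0x1f;
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g5 << 3) | (g5 >> 2));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}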
-void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444,
- uint8_t* dst_y,
- int width) {
- uint64_t ftmp[11];
- uint64_t value = 0x1080108010801080;
- uint64_t mask = 0x0001004200810019;
- uint64_t c0 = 0x000f000f000f000f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g], %[src0], %[four] \n\t"
- "and %[r], %[src1], %[c0] \n\t"
- "psllh %[src0], %[b], %[four] \n\t"
- "or %[b], %[src0], %[b] \n\t"
- "psllh %[src0], %[g], %[four] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psllh %[src0], %[r], %[four] \n\t"
- "or %[r], %[src0], %[r] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[src0], %[src1] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[src0], %[src1] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g], %[src0], %[four] \n\t"
- "and %[r], %[src1], %[c0] \n\t"
- "psllh %[src0], %[b], %[four] \n\t"
- "or %[b], %[src0], %[b] \n\t"
- "psllh %[src0], %[g], %[four] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psllh %[src0], %[r], %[four] \n\t"
- "or %[r], %[src0], %[r] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[src0], %[src1] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[src0], %[src1] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddiu %[width], %[width], -0x08 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
- [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
- [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
- [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
- : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y),
- [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
- [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04)
- : "memory");
-}
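// ARGB4444ToYRow_MMI follows the same pattern; the only difference is the
// 4 -> 8 bit widening, which the psllh/or pairs above implement as nibble
// replication ((x << 4) | x, i.e. x * 17). Illustrative scalar form:
static uint8_t ARGB4444PixelToY(uint16_t v) {
  uint8_t b = (uint8_t)((v & 0x0f) * 0x11);        // (x << 4) | x
  uint8_t g = (uint8_t)(((v >> 4) & 0x0f) * 0x11);
  uint8_t r = (uint8_t)(((v >> 8) & 0x0f) * 0x11);
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}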
-
-void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t ftmp[13];
- uint64_t value = 0x2020202020202020;
- uint64_t mask_u = 0x0026004a00700002;
- uint64_t mask_v = 0x00020070005e0012;
- uint64_t mask = 0x93;
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0007000700070007;
- __asm__ volatile(
- "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t"
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
- "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t"
- "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t"
- "psrlh %[dest0_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest0_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "psrlh %[r0], %[dest0_u], %[three] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest0_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest0_v], %[src0], %[c2] \n\t"
- "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
- "or %[dest0_v], %[src1], %[dest0_v] \n\t"
- "psrlh %[src0], %[src0], %[three] \n\t"
- "paddh %[b0], %[b0], %[dest0_u] \n\t"
- "paddh %[g0], %[g0], %[dest0_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest0_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
- "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t"
- "psrlh %[dest1_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest1_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "psrlh %[r0], %[dest1_u], %[three] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest1_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest1_v], %[src0], %[c2] \n\t"
- "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
- "or %[dest1_v], %[src1], %[dest1_v] \n\t"
- "psrlh %[src0], %[src0], %[three] \n\t"
- "paddh %[b0], %[b0], %[dest1_u] \n\t"
- "paddh %[g0], %[g0], %[dest1_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest1_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t"
- "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t"
- "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t"
- "psrlh %[dest2_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest2_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "psrlh %[r0], %[dest2_u], %[three] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest2_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest2_v], %[src0], %[c2] \n\t"
- "psllh %[dest2_v], %[dest2_v], %[three] \n\t"
- "or %[dest2_v], %[src1], %[dest2_v] \n\t"
- "psrlh %[src0], %[src0], %[three] \n\t"
- "paddh %[b0], %[b0], %[dest2_u] \n\t"
- "paddh %[g0], %[g0], %[dest2_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest2_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t"
- "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t"
- "psrlh %[dest3_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest3_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "psrlh %[r0], %[dest3_u], %[three] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest3_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest3_v], %[src0], %[c2] \n\t"
- "psllh %[dest3_v], %[dest3_v], %[three] \n\t"
- "or %[dest3_v], %[src1], %[dest3_v] \n\t"
- "psrlh %[src0], %[src0], %[three] \n\t"
- "paddh %[b0], %[b0], %[dest3_u] \n\t"
- "paddh %[g0], %[g0], %[dest3_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest3_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t"
- "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddiu %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
- [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
- [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
- [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
- [dest3_v] "=&f"(ftmp[12])
- : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
- [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
- [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
- [one] "f"(0x01)
- : "memory");
-}
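// Scalar sketch of the RGB565 -> UV reduction above (illustrative helper, not
// a libyuv API): each U/V output comes from a 2x2 block averaged across the
// two input rows, after which the standard libyuv BT.601 formulas apply;
// mask_u/mask_v pack those coefficients for pmaddhw. The ARGB1555/ARGB4444
// UV kernels below share this structure and differ only in channel unpacking.
static void RGB565BlockToUV(const uint16_t* row0, const uint16_t* row1,
                            uint8_t* u, uint8_t* v) {
  const uint16_t px[4] = {row0[0], row0[1], row1[0], row1[1]};
  int b = 0, g = 0, r = 0;
  for (int i = 0; i < 4; ++i) {
    uint8_t b5 = px[i] & 0x1f, g6 = (px[i] >> 5) & 0x3f, r5 = px[i] >> 11;
    b += (b5 << 3) | (b5 >> 2);  // 5 -> 8 bits
    g += (g6 << 2) | (g6 >> 4);  // 6 -> 8 bits
    r += (r5 << 3) | (r5 >> 2);
  }
  b /= 4;
  g /= 4;
  r /= 4;  // 2x2 average
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}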
-
-void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t ftmp[11];
- uint64_t value = 0x2020202020202020;
- uint64_t mask_u = 0x0026004a00700002;
- uint64_t mask_v = 0x00020070005e0012;
- uint64_t mask = 0x93;
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0003000300030003;
- uint64_t c3 = 0x007c007c007c007c;
- __asm__ volatile(
- "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t"
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
- "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t"
- "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t"
- "psrlh %[dest0_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest0_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "and %[r0], %[dest0_u], %[c3] \n\t"
- "psrlh %[r0], %[r0], %[two] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest0_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest0_v], %[src0], %[c2] \n\t"
- "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
- "or %[dest0_v], %[src1], %[dest0_v] \n\t"
- "and %[src0], %[src0], %[c3] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "paddh %[b0], %[b0], %[dest0_u] \n\t"
- "paddh %[g0], %[g0], %[dest0_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[six] \n\t"
- "psllh %[g0], %[g0], %[one] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest0_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
- "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t"
- "psrlh %[dest1_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest1_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "and %[r0], %[dest1_u], %[c3] \n\t"
- "psrlh %[r0], %[r0], %[two] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest1_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest1_v], %[src0], %[c2] \n\t"
- "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
- "or %[dest1_v], %[src1], %[dest1_v] \n\t"
- "and %[src0], %[src0], %[c3] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "paddh %[b0], %[b0], %[dest1_u] \n\t"
- "paddh %[g0], %[g0], %[dest1_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[six] \n\t"
- "psllh %[g0], %[g0], %[one] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest1_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t"
- "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t"
- "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t"
- "psrlh %[dest2_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest2_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "and %[r0], %[dest2_u], %[c3] \n\t"
- "psrlh %[r0], %[r0], %[two] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest2_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest0_v], %[src0], %[c2] \n\t"
- "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
- "or %[dest0_v], %[src1], %[dest0_v] \n\t"
- "and %[src0], %[src0], %[c3] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "paddh %[b0], %[b0], %[dest2_u] \n\t"
- "paddh %[g0], %[g0], %[dest0_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest2_u], %[dest0_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[six] \n\t"
- "psllh %[g0], %[g0], %[one] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest2_u], %[dest0_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest2_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t"
- "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t"
- "psrlh %[dest3_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest3_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "and %[r0], %[dest3_u], %[c3] \n\t"
- "psrlh %[r0], %[r0], %[two] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest3_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest1_v], %[src0], %[c2] \n\t"
- "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
- "or %[dest1_v], %[src1], %[dest1_v] \n\t"
- "and %[src0], %[src0], %[c3] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "paddh %[b0], %[b0], %[dest3_u] \n\t"
- "paddh %[g0], %[g0], %[dest1_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest3_u], %[dest1_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[six] \n\t"
- "psllh %[g0], %[g0], %[one] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest3_u], %[dest1_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest3_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[dest0_u], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
- "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t"
- "packushb %[dest0_v], %[dest1_u], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t"
- "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddiu %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
- [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
- [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
- [dest1_v] "=&f"(ftmp[10])
- : [src_argb1555] "r"(src_argb1555),
- [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u),
- [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
- [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3),
- [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
- [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
- [two] "f"(0x02), [one] "f"(0x01)
- : "memory");
-}
-
-void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t ftmp[13];
- uint64_t value = 0x2020202020202020;
- uint64_t mask_u = 0x0026004a00700002;
- uint64_t mask_v = 0x00020070005e0012;
- uint64_t mask = 0x93;
- uint64_t c0 = 0x000f000f000f000f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- __asm__ volatile(
- "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t"
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
- "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t"
- "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t"
- "psrlh %[dest0_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g0], %[src0], %[four] \n\t"
- "and %[r0], %[dest0_u], %[c0] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest0_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[dest0_v], %[src1], %[four] \n\t"
- "and %[src0], %[src0], %[c0] \n\t"
- "paddh %[b0], %[b0], %[dest0_u] \n\t"
- "paddh %[g0], %[g0], %[dest0_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
- "psrlh %[b0], %[src0], %[four] \n\t"
- "psllh %[r0], %[src0], %[two] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[four] \n\t"
- "psllh %[g0], %[g0], %[two] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest0_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
- "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t"
- "psrlh %[dest1_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g0], %[src0], %[four] \n\t"
- "and %[r0], %[dest1_u], %[c0] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest1_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[dest1_v], %[src1], %[four] \n\t"
- "and %[src0], %[src0], %[c0] \n\t"
- "paddh %[b0], %[b0], %[dest1_u] \n\t"
- "paddh %[g0], %[g0], %[dest1_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
- "psrlh %[b0], %[src0], %[four] \n\t"
- "psllh %[r0], %[src0], %[two] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[four] \n\t"
- "psllh %[g0], %[g0], %[two] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest1_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t"
- "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t"
- "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t"
- "psrlh %[dest2_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g0], %[src0], %[four] \n\t"
- "and %[r0], %[dest2_u], %[c0] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest2_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[dest2_v], %[src1], %[four] \n\t"
- "and %[src0], %[src0], %[c0] \n\t"
- "paddh %[b0], %[b0], %[dest2_u] \n\t"
- "paddh %[g0], %[g0], %[dest2_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
- "psrlh %[b0], %[src0], %[four] \n\t"
- "psllh %[r0], %[src0], %[two] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[four] \n\t"
- "psllh %[g0], %[g0], %[two] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest2_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t"
- "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t"
- "psrlh %[dest3_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g0], %[src0], %[four] \n\t"
- "and %[r0], %[dest3_u], %[c0] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest3_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[dest3_v], %[src1], %[four] \n\t"
- "and %[src0], %[src0], %[c0] \n\t"
- "paddh %[b0], %[b0], %[dest3_u] \n\t"
- "paddh %[g0], %[g0], %[dest3_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
- "psrlh %[b0], %[src0], %[four] \n\t"
- "psllh %[r0], %[src0], %[two] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[four] \n\t"
- "psllh %[g0], %[g0], %[two] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest3_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t"
- "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddiu %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
- [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
- [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
- [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
- [dest3_v] "=&f"(ftmp[12])
- : [src_argb4444] "r"(src_argb4444),
- [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u),
- [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
- [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u),
- [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04),
- [two] "f"(0x02)
- : "memory");
-}
-
-void ARGBToUV444Row_MMI(const uint8_t* src_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
- "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
- "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
- "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
- [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]),
- [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]),
- [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]),
- [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]),
- [dest3_v] "=&f"(ftmp[11])
- : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
- [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
- [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10),
- [eight] "f"(0x08)
- : "memory");
-}
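// ARGBToUV444Row_MMI applies the same U/V formulas with no 2x2 subsampling:
// pinsrh_0/pinsrh_3 splice the 0x4040 bias into the lane that pmaddhw
// multiplies by the coefficient 2, so each result carries the 0x8080
// rounding bias directly. Per-pixel scalar equivalent:
//   u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
//   v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);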
-
-void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
- uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi;
- uint64_t tmp0, tmp1;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x01;
- const uint64_t mask2 = 0x00400026004B000FULL;
- const uint64_t mask3 = 0xFF000000FF000000ULL;
- const uint64_t mask4 = ~mask3;
- const uint64_t shift = 0x07;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "and %[src37], %[src], %[mask3] \n\t"
-
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t"
- "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t"
- "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t"
- "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t"
-
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t"
- "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t"
- "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t"
- "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t"
- "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "and %[dest], %[dest], %[mask4] \n\t"
- "or %[dest], %[dest], %[src37] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0),
- [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest),
- [src37] "=&f"(src37)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
- [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4)
- : "memory");
-}
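// Scalar sketch of ARGBGrayRow_MMI (illustrative helper): pinsrh_3 replaces
// the alpha lane with 1 so the pmaddhw mask (15, 75, 38, 64) yields
// 15*B + 75*G + 38*R + 64 in one dot product, while mask3/mask4 carry the
// original alpha bytes (src37) through unchanged.
static void ARGBGrayPixel(const uint8_t* src, uint8_t* dst) {
  uint8_t y = (uint8_t)((15 * src[0] + 75 * src[1] + 38 * src[2] + 64) >> 7);
  dst[0] = dst[1] = dst[2] = y;  // gray written to B, G and R
  dst[3] = src[3];               // alpha preserved
}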
-
-// Convert a row of an image to sepia tone.
-void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) {
- uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2;
- uint64_t tmp0, tmp1;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x002300440011ULL;
- const uint64_t mask2 = 0x002D00580016ULL;
- const uint64_t mask3 = 0x003200620018ULL;
- const uint64_t mask4 = 0xFF000000FF000000ULL;
- const uint64_t shift = 0x07;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "and %[dest37], %[dest], %[mask4] \n\t"
-
- "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t"
- "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t"
- "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t"
- "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t"
- "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
- "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
- "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
- "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
- "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
- "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
- "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
-
- "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t"
- "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t"
- "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t"
- "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t"
- "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
- "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
- "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
- "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
- "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
- "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
- "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "or %[dest], %[dest], %[dest37] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1),
- [dest] "=&f"(dest)
- : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
- [mask4] "f"(mask4), [shift] "f"(shift)
- : "memory");
-}
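// Scalar sketch of the sepia transform, with the weights unpacked from
// mask1..mask3 (illustrative in-place helper; alpha is kept via mask4):
static void ARGBSepiaPixel(uint8_t* p) {  // p = B, G, R, A
  int b = p[0], g = p[1], r = p[2];
  int sb = (17 * b + 68 * g + 35 * r) >> 7;
  int sg = (22 * b + 88 * g + 45 * r) >> 7;
  int sr = (24 * b + 98 * g + 50 * r) >> 7;
  p[0] = (uint8_t)(sb > 255 ? 255 : sb);  // packushb saturates to 255
  p[1] = (uint8_t)(sg > 255 ? 255 : sg);
  p[2] = (uint8_t)(sr > 255 ? 255 : sr);
}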
-
-// Apply a color matrix to a row of an image. The matrix is signed.
-// TODO(fbarchard): Consider adding rounding (+32).
-void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const int8_t* matrix_argb,
- int width) {
- uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2,
- dest3;
- uint64_t matrix, matrix_hi, matrix_lo;
- uint64_t tmp0, tmp1;
- const uint64_t shift0 = 0x06;
- const uint64_t shift1 = 0x08;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
-
- "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
- "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
- "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
- "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
- "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
- "psraw %[dest0], %[dest0], %[shift0] \n\t"
-
- "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
- "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
- "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
- "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
- "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
- "psraw %[dest1], %[dest1], %[shift0] \n\t"
-
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
-
- "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
- "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
- "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
- "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
- "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
- "psraw %[dest2], %[dest2], %[shift0] \n\t"
-
- "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
- "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
- "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
- "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
- "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
- "psraw %[dest3], %[dest3], %[shift0] \n\t"
-
- "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
- "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
- "packushb %[dest], %[tmp0], %[tmp1] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest),
- [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi),
- [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix)
- : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb),
- [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0),
- [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1)
- : "memory");
-}
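// Scalar sketch of ARGBColorMatrixRow_MMI (illustrative helper): matrix_argb
// is a 4x4 signed 8-bit matrix; the psllh/psrah pairs above sign-extend its
// entries to 16 bits before the multiply-accumulate, and packushb clamps the
// >> 6 results to [0, 255].
static void ARGBColorMatrixPixel(const uint8_t* src, uint8_t* dst,
                                 const int8_t* m) {
  for (int j = 0; j < 4; ++j) {
    int v = (src[0] * m[j * 4 + 0] + src[1] * m[j * 4 + 1] +
             src[2] * m[j * 4 + 2] + src[3] * m[j * 4 + 3]) >> 6;
    dst[j] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
  }
}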
-
-void ARGBShadeRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- uint32_t value) {
- uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[src] \n\t"
- "punpckhbh %[src_hi], %[src], %[src] \n\t"
-
- "punpcklbh %[value], %[value], %[value] \n\t"
-
- "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
- "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src),
- [dest] "=&f"(dest)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
- [value] "f"(value), [shift] "f"(shift)
- : "memory");
-}
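// ARGBShadeRow_MMI scales each channel by the matching byte of |value|:
// punpcklbh of a register with itself widens each byte x to the halfword
// x * 0x0101, so pmulhuh followed by psrlh 8 yields approximately
// (x * scale) >> 8 per channel -- the same byte-replication trick the SSE2
// ARGBShadeRow uses with pmulhuw.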
-
-void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo;
- uint64_t dest, dest_lo, dest_hi;
- const uint64_t mask = 0x0;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[src0_lo], %[src0], %[src0] \n\t"
- "punpckhbh %[src0_hi], %[src0], %[src0] \n\t"
-
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src1_lo], %[src1], %[mask] \n\t"
- "punpckhbh %[src1_hi], %[src1], %[mask] \n\t"
-
- "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t"
- "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
- [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0),
- [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
- [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask)
- : "memory");
-}
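// ARGBMultiplyRow_MMI byte-replicates src0 (x -> x * 0x0101) but only
// zero-extends src1, so pmulhuh computes (x * 0x0101 * y) >> 16, i.e.
// approximately (x * y) / 255 per channel. Scalar sketch per byte:
//   dst[i] = (uint8_t)((src0[i] * 0x0101 * src1[i]) >> 16);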
-
-void ARGBAddRow_MMI(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src1, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "paddusb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
- [dst_ptr] "r"(dst_argb), [width] "r"(width)
- : "memory");
-}
-
-void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src1, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "psubusb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
- [dst_ptr] "r"(dst_argb), [width] "r"(width)
- : "memory");
-}
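// ARGBAddRow_MMI and ARGBSubtractRow_MMI each reduce to one saturating byte
// instruction (paddusb / psubusb) over all eight bytes per iteration.
// Scalar equivalents per byte:
//   add:      dst[i] = (uint8_t)(s0[i] + s1[i] > 255 ? 255 : s0[i] + s1[i]);
//   subtract: dst[i] = (uint8_t)(s0[i] < s1[i] ? 0 : s0[i] - s1[i]);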
-
-// Sobel functions which mimic SSSE3.
-void SobelXRow_MMI(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width) {
- uint64_t y00 = 0, y10 = 0, y20 = 0;
- uint64_t y02 = 0, y12 = 0, y22 = 0;
- uint64_t zero = 0x0;
- uint64_t sobel = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
- "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
- "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2]
- "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
-
- "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i]
- "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
- "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2]
- "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
-
- "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i]
- "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t"
- "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2]
- "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t"
-
- "punpcklbh %[y00], %[y00], %[zero] \n\t"
- "punpcklbh %[y10], %[y10], %[zero] \n\t"
- "punpcklbh %[y20], %[y20], %[zero] \n\t"
-
- "punpcklbh %[y02], %[y02], %[zero] \n\t"
- "punpcklbh %[y12], %[y12], %[zero] \n\t"
- "punpcklbh %[y22], %[y22], %[zero] \n\t"
-
- "paddh %[y00], %[y00], %[y10] \n\t" // a+b
- "paddh %[y20], %[y20], %[y10] \n\t" // c+b
- "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c
-
- "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub
- "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub
- "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub
-
- "pmaxsh %[y10], %[y00], %[y02] \n\t"
- "pminsh %[y20], %[y00], %[y02] \n\t"
- "psubh %[sobel], %[y10], %[y20] \n\t" // Abs
-
- "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
- "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
- "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
- "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
-
- "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
- "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
- "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
- "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
-
- "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t"
- "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t"
- "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t"
- "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t"
-
- "punpcklbh %[y00], %[y00], %[zero] \n\t"
- "punpcklbh %[y10], %[y10], %[zero] \n\t"
- "punpcklbh %[y20], %[y20], %[zero] \n\t"
-
- "punpcklbh %[y02], %[y02], %[zero] \n\t"
- "punpcklbh %[y12], %[y12], %[zero] \n\t"
- "punpcklbh %[y22], %[y22], %[zero] \n\t"
-
- "paddh %[y00], %[y00], %[y10] \n\t"
- "paddh %[y20], %[y20], %[y10] \n\t"
- "paddh %[y00], %[y00], %[y20] \n\t"
-
- "paddh %[y02], %[y02], %[y12] \n\t"
- "paddh %[y22], %[y22], %[y12] \n\t"
- "paddh %[y02], %[y02], %[y22] \n\t"
-
- "pmaxsh %[y10], %[y00], %[y02] \n\t"
- "pminsh %[y20], %[y00], %[y02] \n\t"
- "psubh %[y00], %[y10], %[y20] \n\t"
-
- "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
- "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t"
- "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t"
-
- "daddiu %[src_y0], %[src_y0], 8 \n\t"
- "daddiu %[src_y1], %[src_y1], 8 \n\t"
- "daddiu %[src_y2], %[src_y2], 8 \n\t"
- "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10),
- [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22)
- : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2),
- [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero)
- : "memory");
-}
-
-void SobelYRow_MMI(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width) {
- uint64_t y00 = 0, y01 = 0, y02 = 0;
- uint64_t y10 = 0, y11 = 0, y12 = 0;
- uint64_t zero = 0x0;
- uint64_t sobel = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
- "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
- "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1]
- "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t"
- "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2]
- "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
-
- "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i]
- "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
- "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1]
- "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t"
- "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2]
- "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
-
- "punpcklbh %[y00], %[y00], %[zero] \n\t"
- "punpcklbh %[y01], %[y01], %[zero] \n\t"
- "punpcklbh %[y02], %[y02], %[zero] \n\t"
-
- "punpcklbh %[y10], %[y10], %[zero] \n\t"
- "punpcklbh %[y11], %[y11], %[zero] \n\t"
- "punpcklbh %[y12], %[y12], %[zero] \n\t"
-
- "paddh %[y00], %[y00], %[y01] \n\t" // a+b
- "paddh %[y02], %[y02], %[y01] \n\t" // c+b
- "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c
-
- "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub
- "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub
- "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub
-
- "pmaxsh %[y02], %[y00], %[y10] \n\t"
- "pminsh %[y12], %[y00], %[y10] \n\t"
- "psubh %[sobel], %[y02], %[y12] \n\t" // Abs
-
- "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
- "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
- "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t"
- "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t"
- "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
- "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
-
- "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
- "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
- "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t"
- "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t"
- "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
- "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
-
- "punpcklbh %[y00], %[y00], %[zero] \n\t"
- "punpcklbh %[y01], %[y01], %[zero] \n\t"
- "punpcklbh %[y02], %[y02], %[zero] \n\t"
-
- "punpcklbh %[y10], %[y10], %[zero] \n\t"
- "punpcklbh %[y11], %[y11], %[zero] \n\t"
- "punpcklbh %[y12], %[y12], %[zero] \n\t"
-
- "paddh %[y00], %[y00], %[y01] \n\t"
- "paddh %[y02], %[y02], %[y01] \n\t"
- "paddh %[y00], %[y00], %[y02] \n\t"
-
- "paddh %[y10], %[y10], %[y11] \n\t"
- "paddh %[y12], %[y12], %[y11] \n\t"
- "paddh %[y10], %[y10], %[y12] \n\t"
-
- "pmaxsh %[y02], %[y00], %[y10] \n\t"
- "pminsh %[y12], %[y00], %[y10] \n\t"
- "psubh %[y00], %[y02], %[y12] \n\t"
-
- "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
- "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t"
- "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t"
-
- "daddiu %[src_y0], %[src_y0], 8 \n\t"
- "daddiu %[src_y1], %[src_y1], 8 \n\t"
- "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01),
- [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12)
- : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1),
- [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero)
- : "memory");
-}
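-
-// A scalar sketch of the gradient both kernels above compute (the helper
-// name is illustrative, not part of libyuv). Each output is the clamped
-// absolute difference of two 1-2-1 weighted sums; SobelY applies the same
-// window along the row (offsets i, i+1, i+2) across two rows instead.
-static void SobelXRow_Sketch(const uint8_t* y0, const uint8_t* y1,
-                             const uint8_t* y2, uint8_t* dst, int width) {
-  for (int i = 0; i < width; ++i) {
-    int a = y0[i] + 2 * y1[i] + y2[i];              // column i
-    int b = y0[i + 2] + 2 * y1[i + 2] + y2[i + 2];  // column i + 2
-    int s = a > b ? a - b : b - a;                  // pmaxsh/pminsh/psubh
-    dst[i] = s > 255 ? 255 : (uint8_t)s;            // packushb clamps
-  }
-}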
-
-void SobelRow_MMI(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- double temp[3];
- uint64_t c1 = 0xff000000ff000000;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i]
- "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t"
- "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
- "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t"
- // s7 s6 s5 s4 s3 s2 s1 s0 = a+b
- "paddusb %[t2] , %[t0], %[t1] \n\t"
-
- // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0
- "punpcklbh %[t0], %[t2], %[t2] \n\t"
-
- // s1 s1 s0 s0->s1 s1 s1 s1 s0 s0 s0 s0
- "punpcklbh %[t1], %[t0], %[t0] \n\t"
- "or %[t1], %[t1], %[c1] \n\t"
- // 255 s1 s1 s1 255 s0 s0 s0
- "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t"
-
- // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2
- "punpckhbh %[t1], %[t0], %[t0] \n\t"
- "or %[t1], %[t1], %[c1] \n\t"
- // 255 s3 s3 s3 255 s2 s2 s2
- "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t"
-
- // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4
- "punpckhbh %[t0], %[t2], %[t2] \n\t"
-
- // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4
- "punpcklbh %[t1], %[t0], %[t0] \n\t"
- "or %[t1], %[t1], %[c1] \n\t"
- "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t"
- "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t"
-
- // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6
- "punpckhbh %[t1], %[t0], %[t0] \n\t"
- "or %[t1], %[t1], %[c1] \n\t"
- "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t"
- "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t"
-
- "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
- "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
- "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
- : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
- [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
- : "memory");
-}
-
-void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_y,
- int width) {
- uint64_t tr = 0;
- uint64_t tb = 0;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t"
- "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i]
- "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t"
- "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i]
- "paddusb %[tr], %[tr], %[tb] \n\t" // g
- "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t"
- "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t"
-
- "daddiu %[dst_y], %[dst_y], 8 \n\t"
- "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
- "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [tr] "=&f"(tr), [tb] "=&f"(tb)
- : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
- [dst_y] "r"(dst_y), [width] "r"(width)
- : "memory");
-}
-
-void SobelXYRow_MMI(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- uint64_t temp[3];
- uint64_t result = 0;
- uint64_t gb = 0;
- uint64_t cr = 0;
- uint64_t c1 = 0xffffffffffffffff;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i]
- "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t"
- "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
- "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t"
- "paddusb %[tg] , %[tr], %[tb] \n\t" // g
-
- // g3 b3 g2 b2 g1 b1 g0 b0
- "punpcklbh %[gb], %[tb], %[tg] \n\t"
- // c3 r3 c2 r2 c1 r1 c0 r0
- "punpcklbh %[cr], %[tr], %[c1] \n\t"
- // c1 r1 g1 b1 c0 r0 g0 b0
- "punpcklhw %[result], %[gb], %[cr] \n\t"
- "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t"
- // c3 r3 g3 b3 c2 r2 g2 b2
- "punpckhhw %[result], %[gb], %[cr] \n\t"
- "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t"
-
- // g7 b7 g6 b6 g5 b5 g4 b4
- "punpckhbh %[gb], %[tb], %[tg] \n\t"
- // c7 r7 c6 r6 c5 r5 c4 r4
- "punpckhbh %[cr], %[tr], %[c1] \n\t"
- // c5 r5 g5 b5 c4 r4 g4 b4
- "punpcklhw %[result], %[gb], %[cr] \n\t"
- "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t"
- "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t"
- // c7 r7 g7 b7 c6 r6 g6 b6
- "punpckhhw %[result], %[gb], %[cr] \n\t"
- "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t"
- "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t"
-
- "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
- "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
- "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]),
- [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result)
- : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
- [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
- : "memory");
-}
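-
-// A scalar sketch of the packing kernels above (helper name illustrative,
-// not part of libyuv): SobelRow stores the saturated sum as gray ARGB,
-// SobelToPlaneRow stores it as a plain plane, and SobelXYRow keeps X in R,
-// Y in B and the sum in G. ARGB bytes are B, G, R, A in memory.
-static void SobelXYRow_Sketch(const uint8_t* sx, const uint8_t* sy,
-                              uint8_t* dst_argb, int width) {
-  for (int i = 0; i < width; ++i) {
-    int g = sx[i] + sy[i];  // paddusb: unsigned saturating add
-    dst_argb[4 * i + 0] = sy[i];                       // B = Sobel Y
-    dst_argb[4 * i + 1] = g > 255 ? 255 : (uint8_t)g;  // G = X + Y
-    dst_argb[4 * i + 2] = sx[i];                       // R = Sobel X
-    dst_argb[4 * i + 3] = 255;                         // A from c1
-  }
-}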
-
-void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- // Replicate Y into B, G and R; the mask1 constant forces alpha to 255.
- uint64_t src, dest;
- const uint64_t mask0 = 0x00ffffff00ffffffULL;
- const uint64_t mask1 = ~mask0;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src], %[src], %[src] \n\t"
- "punpcklhw %[dest], %[src], %[src] \n\t"
- "and %[dest], %[dest], %[mask0] \n\t"
- "or %[dest], %[dest], %[mask1] \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
-
- "punpckhhw %[dest], %[src], %[src] \n\t"
- "and %[dest], %[dest], %[mask0] \n\t"
- "or %[dest], %[dest], %[mask1] \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
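-
-// Scalar equivalent of the row above (helper name illustrative): each Y
-// byte is replicated into B, G and R, and mask1 supplies alpha = 255.
-static void J400ToARGBRow_Sketch(const uint8_t* src_y, uint8_t* dst_argb,
-                                 int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_argb[4 * i + 0] = src_y[i];  // B
-    dst_argb[4 * i + 1] = src_y[i];  // G
-    dst_argb[4 * i + 2] = src_y[i];  // R
-    dst_argb[4 * i + 3] = 255;       // A
-  }
-}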
-
-void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
- uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x55;
- const uint64_t mask2 = 0xAA;
- const uint64_t mask3 = 0xFF;
- const uint64_t mask4 = 0x4A354A354A354A35ULL;
- const uint64_t mask5 = 0x0488048804880488ULL;
- const uint64_t shift0 = 0x08;
- const uint64_t shift1 = 0x06;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
-
- "pshufh %[src], %[src_lo], %[mask0] \n\t"
- "psllh %[dest_lo], %[src], %[shift0] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
- "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
- "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
- "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
- "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
- "pshufh %[src], %[src_lo], %[mask1] \n\t"
- "psllh %[dest_hi], %[src], %[shift0] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
- "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
- "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
- "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
- "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "pshufh %[src], %[src_lo], %[mask2] \n\t"
- "psllh %[dest_lo], %[src], %[shift0] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
- "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
- "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
- "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
- "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
- "pshufh %[src], %[src_lo], %[mask3] \n\t"
- "psllh %[dest_hi], %[src], %[shift0] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
- "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
- "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
- "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
- "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "pshufh %[src], %[src_hi], %[mask0] \n\t"
- "psllh %[dest_lo], %[src], %[shift0] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
- "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
- "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
- "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
- "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
- "pshufh %[src], %[src_hi], %[mask1] \n\t"
- "psllh %[dest_hi], %[src], %[shift0] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
- "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
- "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
- "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
- "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
-
- "pshufh %[src], %[src_hi], %[mask2] \n\t"
- "psllh %[dest_lo], %[src], %[shift0] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
- "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
- "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
- "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
- "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
- "pshufh %[src], %[src_hi], %[mask3] \n\t"
- "psllh %[dest_hi], %[src], %[shift0] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
- "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
- "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
- "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
- "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
-
- "daddi %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo)
- : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
- [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0),
- [shift1] "f"(shift1), [width] "r"(width)
- : "memory");
-}
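-
-// The fixed-point sequence above (psllh + paddush = y * 257, pmulhuh by
-// 0x4A35, psubh 0x0488, psrah 6) expands limited-range Y to full range,
-// roughly 1.164 * (Y - 16). A per-pixel scalar reading (helper name
-// illustrative, not part of libyuv):
-static uint8_t I400Pixel_Sketch(uint8_t y) {
-  uint32_t v = (uint32_t)y * 0x0101;              // y * 257
-  v = (v * 0x4A35) >> 16;                         // pmulhuh keeps high half
-  int d = ((int)v - 0x0488) >> 6;                 // bias and scale
-  return d < 0 ? 0 : d > 255 ? 255 : (uint8_t)d;  // packushb clamps
-}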
-
-void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
- uint64_t source, src0, src1, dest;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x1b;
-
- src += width - 1;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[source], 0(%[src_ptr]) \n\t"
- "gsldrc1 %[source], -7(%[src_ptr]) \n\t"
- "punpcklbh %[src0], %[source], %[mask0] \n\t"
- "pshufh %[src0], %[src0], %[mask1] \n\t"
- "punpckhbh %[src1], %[source], %[mask0] \n\t"
- "pshufh %[src1], %[src1], %[mask1] \n\t"
- "packushb %[dest], %[src1], %[src0] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddi %[src_ptr], %[src_ptr], -0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0),
- [src1] "=&f"(src1)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
-
-void MirrorUVRow_MMI(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src0, src1, dest0, dest1;
- const uint64_t mask0 = 0x00ff00ff00ff00ffULL;
- const uint64_t mask1 = 0x1b;
- const uint64_t shift = 0x08;
-
- src_uv += (width - 1) << 1;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 1(%[src_ptr]) \n\t"
- "gsldrc1 %[src0], -6(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], -7(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], -14(%[src_ptr]) \n\t"
-
- "and %[dest0], %[src0], %[mask0] \n\t"
- "pshufh %[dest0], %[dest0], %[mask1] \n\t"
- "and %[dest1], %[src1], %[mask0] \n\t"
- "pshufh %[dest1], %[dest1], %[mask1] \n\t"
- "packushb %[dest0], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t"
-
- "psrlh %[dest0], %[src0], %[shift] \n\t"
- "pshufh %[dest0], %[dest0], %[mask1] \n\t"
- "psrlh %[dest1], %[src1], %[shift] \n\t"
- "pshufh %[dest1], %[dest1], %[mask1] \n\t"
- "packushb %[dest0], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t"
-
- "daddi %[src_ptr], %[src_ptr], -0x10 \n\t"
- "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t"
- "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
- [src1] "=&f"(src1)
- : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v),
- [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [shift] "f"(shift)
- : "memory");
-}
-
-void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
- src += (width - 1) * 4;
- uint64_t temp = 0x0;
- uint64_t shuff = 0x4e; // 01 00 11 10
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[temp], 3(%[src]) \n\t"
- "gsldrc1 %[temp], -4(%[src]) \n\t"
- "pshufh %[temp], %[temp], %[shuff] \n\t"
- "gssdrc1 %[temp], 0x0(%[dst]) \n\t"
- "gssdlc1 %[temp], 0x7(%[dst]) \n\t"
-
- "daddiu %[src], %[src], -0x08 \n\t"
- "daddiu %[dst], %[dst], 0x08 \n\t"
- "daddiu %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [temp] "=&f"(temp)
- : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff)
- : "memory");
-}
-
-void SplitUVRow_MMI(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t temp[4];
- uint64_t shift = 0x08;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t"
-
- "and %[t2], %[t0], %[c0] \n\t"
- "and %[t3], %[t1], %[c0] \n\t"
- "packushb %[t2], %[t2], %[t3] \n\t"
- "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t"
-
- "psrlh %[t2], %[t0], %[shift] \n\t"
- "psrlh %[t3], %[t1], %[shift] \n\t"
- "packushb %[t2], %[t2], %[t3] \n\t"
- "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t"
-
- "daddiu %[src_uv], %[src_uv], 16 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
- [t3] "=&f"(temp[3])
- : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
- [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
- : "memory");
-}
-
-void MergeUVRow_MMI(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- uint64_t temp[3];
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x0(%[src_u]) \n\t"
- "gsldlc1 %[t0], 0x7(%[src_u]) \n\t"
- "gsldrc1 %[t1], 0x0(%[src_v]) \n\t"
- "gsldlc1 %[t1], 0x7(%[src_v]) \n\t"
- "punpcklbh %[t2], %[t0], %[t1] \n\t"
- "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t"
- "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t"
- "punpckhbh %[t2], %[t0], %[t1] \n\t"
- "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t"
- "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t"
-
- "daddiu %[src_u], %[src_u], 8 \n\t"
- "daddiu %[src_v], %[src_v], 8 \n\t"
- "daddiu %[dst_uv], %[dst_uv], 16 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
- : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v),
- [width] "r"(width)
- : "memory");
-}
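-
-// Scalar view of the split/merge pair above (helper names illustrative).
-// Interleaved UV is U0 V0 U1 V1 ...; the mask/shift pair selects even and
-// odd bytes, and punpcklbh/punpckhbh re-interleave them.
-static void SplitUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
-                              uint8_t* dst_v, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_u[i] = src_uv[2 * i + 0];  // and with 0x00ff: even bytes
-    dst_v[i] = src_uv[2 * i + 1];  // psrlh by 8: odd bytes
-  }
-}
-static void MergeUVRow_Sketch(const uint8_t* src_u, const uint8_t* src_v,
-                              uint8_t* dst_uv, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_uv[2 * i + 0] = src_u[i];
-    dst_uv[2 * i + 1] = src_v[i];
-  }
-}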
-
-void SplitRGBRow_MMI(const uint8_t* src_rgb,
- uint8_t* dst_r,
- uint8_t* dst_g,
- uint8_t* dst_b,
- int width) {
- uint64_t src[4];
- uint64_t dest_hi, dest_lo, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
- "punpcklbh %[dest_lo], %[src0], %[src1] \n\t"
- "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t"
- "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t"
- "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t"
- "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t"
- "punpcklbh %[dest_hi], %[src2], %[src3] \n\t"
-
- "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t"
- "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t"
- "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t"
- "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t"
- "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
- "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t"
- "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t"
- "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]),
- [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g),
- [dstb_ptr] "r"(dst_b), [width] "r"(width)
- : "memory");
-}
-
-void MergeRGBRow_MMI(const uint8_t* src_r,
- const uint8_t* src_g,
- const uint8_t* src_b,
- uint8_t* dst_rgb,
- int width) {
- uint64_t srcr, srcg, srcb, dest;
- uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo;
- const uint64_t temp = 0x0;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t"
- "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t"
- "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t"
- "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t"
- "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t"
- "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t"
-
- "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t"
- "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t"
- "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t"
- "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t"
-
- "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
- "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t"
- "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
- "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t"
- "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
- "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
- "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t"
-
- "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t"
- "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t"
- "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb),
- [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi),
- [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi),
- [srcbz_lo] "=&f"(srcbz_lo)
- : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b),
- [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp)
- : "memory");
-}
-
-// Filter 2 rows of YUY2 UV's (422) into U and V (420).
-void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
- int src_stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t c0 = 0xff00ff00ff00ff00;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t temp[3];
- uint64_t data[4];
- uint64_t shift = 0x08;
- uint64_t src_stride = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
- "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t"
- "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
-
- "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t"
- "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
- "pavgb %[t1], %[t2], %[t1] \n\t"
-
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "psrlh %[t0], %[t0], %[shift] \n\t"
- "psrlh %[t1], %[t1], %[shift] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d0], %[t0], %[c1] \n\t"
- "psrlh %[d1], %[t1], %[shift] \n\t"
-
- "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
-
- "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t"
- "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
- "pavgb %[t1], %[t2], %[t1] \n\t"
-
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "psrlh %[t0], %[t0], %[shift] \n\t"
- "psrlh %[t1], %[t1], %[shift] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d2], %[t0], %[c1] \n\t"
- "psrlh %[d3], %[t1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d2] \n\t"
- "packushb %[d1], %[d1], %[d3] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
- "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
- "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -16 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
- [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
- [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
- : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
- : "memory");
-}
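-
-// Scalar sketch of the 420 chroma filter above (helper name illustrative).
-// YUY2 stores Y0 U0 Y1 V0; pavgb averages the row with the row below using
-// round-half-up, then the U and V bytes are separated.
-static void YUY2ToUVRow_Sketch(const uint8_t* yuy2, int stride,
-                               uint8_t* dst_u, uint8_t* dst_v, int width) {
-  for (int i = 0; i < width; i += 2) {
-    const uint8_t* p0 = yuy2 + 2 * i;  // 2 bytes per pixel
-    const uint8_t* p1 = p0 + stride;   // next row
-    dst_u[i / 2] = (uint8_t)((p0[1] + p1[1] + 1) >> 1);
-    dst_v[i / 2] = (uint8_t)((p0[3] + p1[3] + 1) >> 1);
-  }
-}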
-
-// Copy row of YUY2 UV's (422) into U and V (422).
-void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t c0 = 0xff00ff00ff00ff00;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t temp[2];
- uint64_t data[4];
- uint64_t shift = 0x08;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "psrlh %[t0], %[t0], %[shift] \n\t"
- "psrlh %[t1], %[t1], %[shift] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d0], %[t0], %[c1] \n\t"
- "psrlh %[d1], %[t1], %[shift] \n\t"
-
- "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t"
- "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "psrlh %[t0], %[t0], %[shift] \n\t"
- "psrlh %[t1], %[t1], %[shift] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d2], %[t0], %[c1] \n\t"
- "psrlh %[d3], %[t1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d2] \n\t"
- "packushb %[d1], %[d1], %[d3] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
- "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
- "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -16 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
- [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
- : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
- : "memory");
-}
-
-// Copy row of YUY2 Y's (422) into Y (420/422).
-void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t temp[2];
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
- "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
- "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t"
- "daddiu %[dst_y], %[dst_y], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
- : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width),
- [c0] "f"(c0)
- : "memory");
-}
-
-// Filter 2 rows of UYVY UV's (422) into U and V (420).
-void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
- int src_stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- // Output a row of UV values.
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t temp[3];
- uint64_t data[4];
- uint64_t shift = 0x08;
- uint64_t src_stride = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
- "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t"
- "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
-
- "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t"
- "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
- "pavgb %[t1], %[t2], %[t1] \n\t"
-
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d0], %[t0], %[c0] \n\t"
- "psrlh %[d1], %[t1], %[shift] \n\t"
-
- "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
-
- "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t"
- "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
- "pavgb %[t1], %[t2], %[t1] \n\t"
-
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d2], %[t0], %[c0] \n\t"
- "psrlh %[d3], %[t1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d2] \n\t"
- "packushb %[d1], %[d1], %[d3] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
- "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
- "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -16 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
- [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
- [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
- : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [c0] "f"(c0), [shift] "f"(shift)
- : "memory");
-}
-
-// Copy row of UYVY UV's (422) into U and V (422).
-void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- // Output a row of UV values.
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t temp[2];
- uint64_t data[4];
- uint64_t shift = 0x08;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d0], %[t0], %[c0] \n\t"
- "psrlh %[d1], %[t1], %[shift] \n\t"
-
- "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t"
- "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d2], %[t0], %[c0] \n\t"
- "psrlh %[d3], %[t1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d2] \n\t"
- "packushb %[d1], %[d1], %[d3] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
- "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
- "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -16 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
- [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
- : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
- [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
- : "memory");
-}
-
-// Copy row of UYVY Y's (422) into Y (420/422).
-void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
- // Output a row of Y values.
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t shift = 0x08;
- uint64_t temp[2];
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
- "dsrl %[t0], %[t0], %[shift] \n\t"
- "dsrl %[t1], %[t1], %[shift] \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
- "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
- "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t"
- "daddiu %[dst_y], %[dst_y], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
- : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width),
- [c0] "f"(c0), [shift] "f"(shift)
- : "memory");
-}
-
-// Blend src_argb0 over src_argb1 and store to dst_argb.
-// dst_argb may be src_argb0 or src_argb1.
-// This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_MMI(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi,
- dest_lo;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL;
- const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
- const uint64_t mask3 = 0xFF;
- const uint64_t mask4 = ~mask1;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
-
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
-
- "psubush %[alpha], %[mask2], %[src0_lo] \n\t"
- "pshufh %[alpha], %[alpha], %[mask3] \n\t"
- "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t"
-
- "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
- "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
-
- "psubush %[alpha], %[mask2], %[src0_hi] \n\t"
- "pshufh %[alpha], %[alpha], %[mask3] \n\t"
- "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[mask4] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha),
- [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
- [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo)
- : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
- [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4),
- [shift] "f"(shift), [width] "r"(width)
- : "memory");
-}
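-
-// Scalar form of the blend above (helper name illustrative). As in the
-// SSSE3 row it mimics, the divide is by 256 (psrlh 8), not 255, and the
-// destination alpha is forced to 255 via mask4.
-static void ARGBBlendPixel_Sketch(const uint8_t* fg, const uint8_t* bg,
-                                  uint8_t* dst) {
-  int na = 255 - fg[3];          // psubush + pshufh: broadcast 255 - alpha
-  for (int c = 0; c < 3; ++c) {  // B, G, R
-    int v = fg[c] + ((bg[c] * na) >> 8);  // pmullh, psrlh, paddush
-    dst[c] = v > 255 ? 255 : (uint8_t)v;
-  }
-  dst[3] = 255;
-}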
-
-void BlendPlaneRow_MMI(const uint8_t* src0,
- const uint8_t* src1,
- const uint8_t* alpha,
- uint8_t* dst,
- int width) {
- uint64_t source0, source1, dest, alph;
- uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi,
- dest_lo;
- uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL;
- const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
- "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
-
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
- "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
-
- "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t"
- "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t"
- "psubusb %[alpha_r], %[mask1], %[alpha] \n\t"
- "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t"
- "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t"
- "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t"
- "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t"
-
- "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t"
- "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[dest] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
-
- "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t"
- "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[dest] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph),
- [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
- [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
- [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi),
- [alpha_r] "=&f"(alpha_rev)
- : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha),
- [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width)
- : "memory");
-}
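-
-// Scalar equivalent of BlendPlaneRow_MMI (helper name illustrative): a
-// per-byte weighted average, with the 0x00FF bias (mask2) providing the
-// + 255 rounding before the shift.
-static void BlendPlaneRow_Sketch(const uint8_t* s0, const uint8_t* s1,
-                                 const uint8_t* a, uint8_t* dst, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst[i] = (uint8_t)((s0[i] * a[i] + s1[i] * (255 - a[i]) + 255) >> 8);
-  }
-}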
-
-// Multiply source RGB by alpha and store to destination.
-// This code mimics the SSSE3 version for better testability.
-void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha;
- const uint64_t mask0 = 0xFF;
- const uint64_t mask1 = 0xFF000000FF000000ULL;
- const uint64_t mask2 = ~mask1;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[src] \n\t"
- "punpckhbh %[src_hi], %[src], %[src] \n\t"
-
- "pshufh %[alpha], %[src_lo], %[mask0] \n\t"
- "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
- "pshufh %[alpha], %[src_hi], %[mask0] \n\t"
- "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "and %[dest], %[dest], %[mask2] \n\t"
- "and %[src], %[src], %[mask1] \n\t"
- "or %[dest], %[dest], %[src] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift),
- [width] "r"(width)
- : "memory");
-}
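-
-// Scalar model of the attenuate row above (helper name illustrative). The
-// byte-doubling punpck plus pmulhuh approximates b * a / 255 per channel
-// (>> 16 from pmulhuh plus >> 8 from psrlh), and alpha passes through.
-static void ARGBAttenuatePixel_Sketch(const uint8_t* src, uint8_t* dst) {
-  uint32_t a = src[3];
-  for (int c = 0; c < 3; ++c) {  // B, G, R
-    dst[c] = (uint8_t)(((src[c] * 257u) * (a * 257u)) >> 24);
-  }
-  dst[3] = src[3];
-}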
-
-void ComputeCumulativeSumRow_MMI(const uint8_t* row,
- int32_t* cumsum,
- const int32_t* previous_cumsum,
- int width) {
- int64_t row_sum[2] = {0, 0};
- uint64_t src, dest0, dest1, presrc0, presrc1, dest;
- const uint64_t mask = 0x0;
-
- __asm__ volatile(
- "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t"
- "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t"
- "gslwrc1 %[src], 0x00(%[row_ptr]) \n\t"
-
- "punpcklbh %[src], %[src], %[mask] \n\t"
- "punpcklhw %[dest0], %[src], %[mask] \n\t"
- "punpckhhw %[dest1], %[src], %[mask] \n\t"
-
- "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t"
- "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t"
-
- "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t"
- "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t"
- "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t"
- "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t"
-
- "paddw %[dest0], %[row_sum0], %[presrc0] \n\t"
- "paddw %[dest1], %[row_sum1], %[presrc1] \n\t"
-
- "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t"
- "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x01 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]),
- [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0),
- [presrc1] "=&f"(presrc1)
- : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum),
- [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask)
- : "memory");
-}
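-
-// Scalar restatement of the row above (helper name illustrative): keep a
-// running per-channel sum across the row and add the previous row's
-// cumulative sums, producing one row of a summed-area table.
-static void ComputeCumulativeSumRow_Sketch(const uint8_t* row,
-                                           int32_t* cumsum,
-                                           const int32_t* prev, int width) {
-  int32_t sum[4] = {0, 0, 0, 0};
-  for (int x = 0; x < width; ++x) {
-    for (int c = 0; c < 4; ++c) {
-      sum[c] += row[4 * x + c];
-      cumsum[4 * x + c] = sum[c] + prev[4 * x + c];
-    }
-  }
-}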
-
-// Bilinear row filter: blend 2 rows into 1 (2x2 -> 2x1).
-void InterpolateRow_MMI(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int width,
- int source_y_fraction) {
- if (source_y_fraction == 0) {
- __asm__ volatile(
- "1: \n\t"
- "ld $t0, 0x0(%[src_ptr]) \n\t"
- "sd $t0, 0x0(%[dst_ptr]) \n\t"
- "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- :
- : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width)
- : "memory");
- return;
- }
- if (source_y_fraction == 128) {
- uint64_t uv = 0x0;
- uint64_t uv_stride = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t"
- "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t"
- "daddu $t0, %[src_ptr], %[stride] \n\t"
- "gsldrc1 %[uv_stride], 0x0($t0) \n\t"
- "gsldlc1 %[uv_stride], 0x7($t0) \n\t"
-
- "pavgb %[uv], %[uv], %[uv_stride] \n\t"
- "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t"
- "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width),
- [stride] "r"((int64_t)src_stride)
- : "memory");
- return;
- }
- const uint8_t* src_ptr1 = src_ptr + src_stride;
- uint64_t temp;
- uint64_t data[4];
- uint64_t zero = 0x0;
- uint64_t c0 = 0x0080008000800080;
- uint64_t fy0 = 0x0100010001000100;
- uint64_t shift = 0x8;
- __asm__ volatile(
- "pshufh %[fy1], %[fy1], %[zero] \n\t"
- "psubh %[fy0], %[fy0], %[fy1] \n\t"
- "1: \n\t"
- "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t"
- "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t"
- "punpcklbh %[d0], %[t0], %[zero] \n\t"
- "punpckhbh %[d1], %[t0], %[zero] \n\t"
- "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t"
- "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t"
- "punpcklbh %[d2], %[t0], %[zero] \n\t"
- "punpckhbh %[d3], %[t0], %[zero] \n\t"
-
- "pmullh %[d0], %[d0], %[fy0] \n\t"
- "pmullh %[d2], %[d2], %[fy1] \n\t"
- "paddh %[d0], %[d0], %[d2] \n\t"
- "paddh %[d0], %[d0], %[c0] \n\t"
- "psrlh %[d0], %[d0], %[shift] \n\t"
-
- "pmullh %[d1], %[d1], %[fy0] \n\t"
- "pmullh %[d3], %[d3], %[fy1] \n\t"
- "paddh %[d1], %[d1], %[d3] \n\t"
- "paddh %[d1], %[d1], %[c0] \n\t"
- "psrlh %[d1], %[d1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d1] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t"
- "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
- "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]),
- [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
- : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1),
- [dst_ptr] "r"(dst_ptr), [width] "r"(width),
- [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0),
- [shift] "f"(shift), [zero] "f"(zero)
- : "memory");
-}
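-
-// The two early returns above handle f == 0 (plain copy) and f == 128
-// (pavgb average); the general path is, in scalar form (helper name
-// illustrative), a rounded weighted blend of the two rows:
-static void InterpolateRow_Sketch(uint8_t* dst, const uint8_t* s0,
-                                  const uint8_t* s1, int width, int f) {
-  for (int i = 0; i < width; ++i) {
-    dst[i] = (uint8_t)((s0[i] * (256 - f) + s1[i] * f + 128) >> 8);
-  }
-}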
-
-// Use first 4 shuffler values to reorder ARGB channels.
-void ARGBShuffleRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- uint64_t source, dest0, dest1, dest;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) |
- ((shuffler[2] & 0x03) << 4) |
- ((shuffler[3] & 0x03) << 6);
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "punpcklbh %[dest0], %[src], %[mask0] \n\t"
- "pshufh %[dest0], %[dest0], %[mask1] \n\t"
- "punpckhbh %[dest1], %[src], %[mask0] \n\t"
- "pshufh %[dest1], %[dest1], %[mask1] \n\t"
- "packushb %[dest], %[dest0], %[dest1] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
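-
-// Scalar equivalent of the shuffle above (helper name illustrative). Only
-// the low two bits of the first four shuffler entries survive into mask1,
-// so one 4-byte permutation is applied to every pixel.
-static void ARGBShuffleRow_Sketch(const uint8_t* src, uint8_t* dst,
-                                  const uint8_t* shuffler, int width) {
-  for (int i = 0; i < width; ++i) {
-    for (int c = 0; c < 4; ++c) {
-      dst[4 * i + c] = src[4 * i + (shuffler[c] & 3)];
-    }
-  }
-}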
-
-void I422ToYUY2Row_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_frame,
- int width) {
- uint64_t temp[3];
- uint64_t vu = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i]
- "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i]
- "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i]
- "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i]
- "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i]
- "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i]
- "punpcklbh %[vu], %[tu], %[tv] \n\t" // g
- "punpcklbh %[tu], %[ty], %[vu] \n\t" // g
- "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
- "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
- "punpckhbh %[tu], %[ty], %[vu] \n\t" // g
- "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
- "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
- "daddiu %[src_y], %[src_y], 8 \n\t"
- "daddiu %[src_u], %[src_u], 4 \n\t"
- "daddiu %[src_v], %[src_v], 4 \n\t"
- "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
- [vu] "=&f"(vu)
- : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
- [dst_frame] "r"(dst_frame), [width] "r"(width)
- : "memory");
-}
-
-void I422ToUYVYRow_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_frame,
- int width) {
- uint64_t temp[3];
- uint64_t vu = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i]
- "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i]
- "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i]
- "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i]
- "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i]
- "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i]
- "punpcklbh %[vu], %[tu], %[tv] \n\t" // g
- "punpcklbh %[tu], %[vu], %[ty] \n\t" // g
- "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
- "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
- "punpckhbh %[tu], %[vu], %[ty] \n\t" // g
- "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
- "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
- "daddiu %[src_y], %[src_y], 8 \n\t"
- "daddiu %[src_u], %[src_u], 4 \n\t"
- "daddiu %[src_v], %[src_v], 4 \n\t"
- "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
- [vu] "=&f"(vu)
- : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
- [dst_frame] "r"(dst_frame), [width] "r"(width)
- : "memory");
-}
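-
-// The two rows above differ only in byte order. A scalar sketch of the
-// YUY2 packing (helper name illustrative); UYVY emits U0 Y0 V0 Y1 instead.
-static void I422ToYUY2Row_Sketch(const uint8_t* y, const uint8_t* u,
-                                 const uint8_t* v, uint8_t* dst, int width) {
-  for (int i = 0; i < width; i += 2) {
-    dst[2 * i + 0] = y[i];      // Y0
-    dst[2 * i + 1] = u[i / 2];  // U0
-    dst[2 * i + 2] = y[i + 1];  // Y1
-    dst[2 * i + 3] = v[i / 2];  // V0
-  }
-}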
-
-void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
- uint64_t source, dest;
- const uint64_t mask0 = 0xff000000ff000000ULL;
- const uint64_t mask1 = ~mask0;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "and %[src], %[src], %[mask0] \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[src], %[dest] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(source), [dest] "=&f"(dest)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
-
-void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- uint64_t src, dest0, dest1, dest_lo, dest_hi, dest;
- const uint64_t mask = 0xff000000ff000000ULL;
- const uint64_t shift = 0x18;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "and %[dest0], %[src], %[mask] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
- "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
- "and %[dest1], %[src], %[mask] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
- "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
- "and %[dest0], %[src], %[mask] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
- "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
- "and %[dest1], %[src], %[mask] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
- "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask),
- [shift] "f"(shift), [width] "r"(width)
- : "memory");
-}
-
-void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
- uint64_t source, dest0, dest1, dest;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x00ffffff00ffffffULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "punpcklbh %[dest0], %[mask0], %[src] \n\t"
- "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
- "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
- "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
- "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "punpckhbh %[dest0], %[mask0], %[src] \n\t"
- "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
- "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
- "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
- "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
-
-#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc
deleted file mode 100644
index a12fa790..00000000
--- a/files/source/row_neon.cc
+++ /dev/null
@@ -1,2892 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#include <stdio.h>
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
- !defined(__aarch64__)
-
-// Read 8 Y, 4 U and 4 V from 422
-#define READYUV422 \
- "vld1.8 {d0}, [%0]! \n" \
- "vld1.32 {d2[0]}, [%1]! \n" \
- "vld1.32 {d2[1]}, [%2]! \n"
-
-// Read 8 Y, 8 U and 8 V from 444. Adjacent U and V pairs are averaged
-// (vpaddl + vrshrn) down to the 4 U and 4 V the 8-pixel kernel consumes.
-#define READYUV444 \
- "vld1.8 {d0}, [%0]! \n" \
- "vld1.8 {d2}, [%1]! \n" \
- "vld1.8 {d3}, [%2]! \n" \
- "vpaddl.u8 q1, q1 \n" \
- "vrshrn.u16 d2, q1, #1 \n"
-
-// Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400 \
- "vld1.8 {d0}, [%0]! \n" \
- "vmov.u8 d2, #128 \n"
-
-// Read 8 Y and 4 UV from NV12
-#define READNV12 \
- "vld1.8 {d0}, [%0]! \n" \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-// Read 8 Y and 4 VU from NV21
-#define READNV21 \
- "vld1.8 {d0}, [%0]! \n" \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
- "vuzp.u8 d3, d2 \n" \
- "vtrn.u32 d2, d3 \n"
-
-// Read 8 YUY2
-#define READYUY2 \
- "vld2.8 {d0, d2}, [%0]! \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-// Read 8 UYVY
-#define READUYVY \
- "vld2.8 {d2, d3}, [%0]! \n" \
- "vmov.u8 d0, d3 \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-#define YUVTORGB_SETUP \
- "vld1.8 {d24}, [%[kUVToRB]] \n" \
- "vld1.8 {d25}, [%[kUVToG]] \n" \
- "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
- "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
- "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
- "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
-
-#define YUVTORGB \
- "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \
- "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \
- "vmovl.u8 q0, d0 \n" /* Y */ \
- "vmovl.s16 q10, d1 \n" \
- "vmovl.s16 q0, d0 \n" \
- "vmul.s32 q10, q10, q15 \n" \
- "vmul.s32 q0, q0, q15 \n" \
- "vqshrun.s32 d0, q0, #16 \n" \
- "vqshrun.s32 d1, q10, #16 \n" /* Y */ \
- "vadd.s16 d18, d19 \n" \
- "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \
- "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \
- "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \
- "vaddw.u16 q1, q1, d16 \n" \
- "vaddw.u16 q10, q10, d17 \n" \
- "vaddw.u16 q3, q3, d18 \n" \
- "vqadd.s16 q8, q0, q13 \n" /* B */ \
- "vqadd.s16 q9, q0, q14 \n" /* R */ \
- "vqadd.s16 q0, q0, q4 \n" /* G */ \
- "vqadd.s16 q8, q8, q1 \n" /* B */ \
- "vqadd.s16 q9, q9, q10 \n" /* R */ \
- "vqsub.s16 q0, q0, q3 \n" /* G */ \
- "vqshrun.s16 d20, q8, #6 \n" /* B */ \
- "vqshrun.s16 d22, q9, #6 \n" /* R */ \
- "vqshrun.s16 d21, q0, #6 \n" /* G */
-
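For reference, YUVTORGB above is a fixed-point vectorization of the per-pixel conversion sketched below in C. The constants are illustrative BT.601 values (assumed here; the shipped values are loaded from struct YuvConstants by YUVTORGB_SETUP), and the final >>6 with clamping mirrors the vqshrun.s16 #6 narrowing. A sketch, not the exact kernel:

#include <stdint.h>

// Illustrative BT.601 fixed-point constants (assumed values; the real ones
// come from struct YuvConstants).
enum { YG = 18997, YGB = -1160, UB = 128, UG = 25, VG = 52, VR = 102 };

static uint8_t Clamp255(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// One pixel: Y gain in 16.16 fixed point (the vmul.s32/vqshrun #16 pair),
// U/V terms scaled by 64, then >>6 with saturation as in vqshrun.s16 #6.
static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
  int32_t y1 = (int32_t)(((uint32_t)(y * 0x0101) * YG) >> 16);
  *b = Clamp255((y1 + u * UB + (YGB - 128 * UB)) >> 6);
  *g = Clamp255((y1 - (u * UG + v * VG) + (YGB + 128 * (UG + VG))) >> 6);
  *r = Clamp255((y1 + v * VR + (YGB - 128 * VR)) >> 6);
}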
-void I444ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READYUV444 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void I422ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- const uint8_t* src_a,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB
- "subs %5, %5, #8 \n"
- "vld1.8 {d23}, [%3]! \n"
- "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(src_a), // %3
- "+r"(dst_argb), // %4
- "+r"(width) // %5
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void I422ToRGBARow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgba), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void I422ToRGB24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb24), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-#define ARGBTORGB565 \
- "vshll.u8 q0, d22, #8 \n" /* R */ \
- "vshll.u8 q8, d21, #8 \n" /* G */ \
- "vshll.u8 q9, d20, #8 \n" /* B */ \
- "vsri.16 q0, q8, #5 \n" /* RG */ \
- "vsri.16 q0, q9, #11 \n" /* RGB */
-
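ARGBTORGB565 shifts each channel to the top of a 16-bit lane with vshll.u8 and stitches the fields together with vsri.16, which keeps only the top 5-6-5 bits of each channel. A per-pixel C sketch of the same packing:

// Pack 8-bit R,G,B into RGB565 (r:5 g:6 b:5), keeping each channel's top
// bits, exactly what the vshll/vsri sequence computes per 16-bit lane.
static uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}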
-void I422ToRGB565Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb565), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-#define ARGBTOARGB1555 \
- "vshll.u8 q0, d23, #8 \n" /* A */ \
- "vshll.u8 q8, d22, #8 \n" /* R */ \
- "vshll.u8 q9, d21, #8 \n" /* G */ \
- "vshll.u8 q10, d20, #8 \n" /* B */ \
- "vsri.16 q0, q8, #1 \n" /* AR */ \
- "vsri.16 q0, q9, #6 \n" /* ARG */ \
- "vsri.16 q0, q10, #11 \n" /* ARGB */
-
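ARGBTOARGB1555 is the same vshll/vsri trick with a leading 1-bit alpha field; per pixel it reduces to this sketch:

// Pack to ARGB1555 (a:1 r:5 g:5 b:5) from 8-bit channels, keeping top bits.
static uint16_t PackARGB1555(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((a >> 7) << 15) | ((r >> 3) << 10) |
                    ((g >> 3) << 5) | (b >> 3));
}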
-void I422ToARGB1555Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb1555,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB1555
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb1555), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-#define ARGBTOARGB4444 \
- "vshr.u8 d20, d20, #4 \n" /* B */ \
- "vbic.32 d21, d21, d4 \n" /* G */ \
- "vshr.u8 d22, d22, #4 \n" /* R */ \
- "vbic.32 d23, d23, d4 \n" /* A */ \
- "vorr d0, d20, d21 \n" /* BG */ \
- "vorr d1, d22, d23 \n" /* RA */ \
- "vzip.u8 d0, d1 \n" /* BGRA */
-
-void I422ToARGB4444Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb4444,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "vmov.u8 d4, #0x0f \n" // vbic bits to clear
- "1: \n"
-
- READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB4444
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb4444), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- asm volatile(
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READYUV400 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
- [kUVToG] "r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- asm volatile(
- "vmov.u8 d23, #255 \n"
- "1: \n"
- "vld1.8 {d20}, [%0]! \n"
- "vmov d21, d20 \n"
- "vmov d22, d20 \n"
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d20", "d21", "d22", "d23");
-}
-
-void NV12ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-void NV21ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-void NV12ToRGB24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
-
- YUVTORGB_SETUP
-
- "1: \n"
-
- READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb24), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void NV21ToRGB24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
-
- YUVTORGB_SETUP
-
- "1: \n"
-
- READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_rgb24), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void NV12ToRGB565Row_NEON(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb565), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
-}
-
-void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READYUY2 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READUYVY YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-// Reads 16 pairs of UV and writes even values to dst_u and odd values to dst_v.
-void SplitUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store U
- "vst1.8 {q1}, [%2]! \n" // store V
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
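The vld2.8 load performs the de-interleave itself; as a scalar reference (a sketch matching libyuv's C fallback):

// Split interleaved UV bytes into separate U and V planes.
static void SplitUVRow_C_Sketch(const uint8_t* src_uv,
                                uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];
    dst_v[x] = src_uv[2 * x + 1];
  }
}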
-// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- asm volatile(
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load U
- "vld1.8 {q1}, [%1]! \n" // load V
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
- "bgt 1b \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// Reads 16 packed RGB pixels and writes to planar dst_r, dst_g, dst_b.
-void SplitRGBRow_NEON(const uint8_t* src_rgb,
- uint8_t* dst_r,
- uint8_t* dst_g,
- uint8_t* dst_b,
- int width) {
- asm volatile(
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
- "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store R
- "vst1.8 {q1}, [%2]! \n" // store G
- "vst1.8 {q2}, [%3]! \n" // store B
- "bgt 1b \n"
- : "+r"(src_rgb), // %0
- "+r"(dst_r), // %1
- "+r"(dst_g), // %2
- "+r"(dst_b), // %3
- "+r"(width) // %4
- : // Input registers
- : "cc", "memory", "d0", "d1", "d2" // Clobber List
- );
-}
-
-// Reads 16 planar R's, G's and B's and writes out 16 packed RGB pixels at a time.
-void MergeRGBRow_NEON(const uint8_t* src_r,
- const uint8_t* src_g,
- const uint8_t* src_b,
- uint8_t* dst_rgb,
- int width) {
- asm volatile(
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load R
- "vld1.8 {q1}, [%1]! \n" // load G
- "vld1.8 {q2}, [%2]! \n" // load B
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
- "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
- "bgt 1b \n"
- : "+r"(src_r), // %0
- "+r"(src_g), // %1
- "+r"(src_b), // %2
- "+r"(dst_rgb), // %3
- "+r"(width) // %4
- : // Input registers
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
-}
-
-// Copies multiples of 32 bytes. vld1.8 allows unaligned access and is fastest on Cortex-A15.
-void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
- "subs %2, %2, #32 \n" // 32 processed per loop
- "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2 // Output registers
- : // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// SetRow writes 'width' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
- asm volatile(
- "vdup.8 q0, %2 \n" // duplicate 16 bytes
- "1: \n"
- "subs %1, %1, #16 \n" // 16 bytes per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
- : "+r"(dst), // %0
- "+r"(width) // %1
- : "r"(v8) // %2
- : "cc", "memory", "q0");
-}
-
-// ARGBSetRow writes 'width' pixels using a 32 bit value repeated.
-void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
- asm volatile(
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
- "1: \n"
- "subs %1, %1, #4 \n" // 4 pixels per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
- : "+r"(dst), // %0
- "+r"(width) // %1
- : "r"(v32) // %2
- : "cc", "memory", "q0");
-}
-
-void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- // Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2 \n"
- "sub %0, #16 \n"
-
- "1: \n"
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #16 \n" // 16 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0");
-}
-
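MirrorRow walks the source backwards 16 bytes at a time and reverses each block with vrev64.8; the two d-register stores are swapped because vrev64 only reverses within 64-bit lanes. A scalar sketch:

// Reverse a row of bytes.
static void MirrorRow_C_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}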
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- // Start at end of source row.
- "mov r12, #-16 \n"
- "add %0, %0, %3, lsl #1 \n"
- "sub %0, #16 \n"
-
- "1: \n"
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %3, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d0}, [%1]! \n" // dst += 8
- "vst1.8 {d1}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "r12", "q0");
-}
-
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- // Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2, lsl #2 \n"
- "sub %0, #16 \n"
-
- "1: \n"
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #4 \n" // 4 pixels per loop.
- "vrev64.32 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0");
-}
-
-void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
- "1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
- asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
- "1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
- asm volatile(
- "1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
- // RGB24.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3" // Clobber List
- );
-}
-
-#define RGB565TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
- "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
- "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
-
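RGB565TOARGB widens each 5- or 6-bit field back to 8 bits and replicates the high bits into the low bits, so a full-scale input maps to 255 rather than 248 or 252. Per pixel:

// Expand RGB565 to 8-bit B,G,R with bit replication, as the vshl/vshr/vorr
// sequence does (e.g. 5-bit b -> (b << 3) | (b >> 2)).
static void UnpackRGB565(uint16_t p, uint8_t* b, uint8_t* g, uint8_t* r) {
  uint8_t b5 = p & 0x1f, g6 = (p >> 5) & 0x3f, r5 = (p >> 11) & 0x1f;
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}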
-void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-#define ARGB1555TOARGB \
- "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
- "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
- "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
- "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
- "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
- "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
- "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
- "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
- "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
- "vorr.u8 q1, q1, q3 \n" /* R,A */ \
- "vorr.u8 q0, q0, q2 \n" /* B,G */
-
-// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
- "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
- "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
-
-void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-#define ARGB4444TOARGB \
- "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
- "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
- "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
- "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
- "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
- "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
- "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
- "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
-
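ARGB4444TOARGB expands each 4-bit field by shifting it into the top nibble and OR-ing the original back into the bottom, i.e. n -> n * 17. Per channel:

// Expand a 4-bit channel value to 8 bits: (n << 4) | n.
static uint8_t Expand4(uint8_t n) {
  return (uint8_t)((n << 4) | (n & 0x0f));
}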
-void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
-}
-
-void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_rgb24,
- int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
- // RGB24.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_raw), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
- asm volatile(
- "1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
- asm volatile(
- "1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d1}, [%1]! \n" // store 8 U.
- "vst1.8 {d3}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
- );
-}
-
-void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d0}, [%1]! \n" // store 8 U.
- "vst1.8 {d2}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
- );
-}
-
-void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
- int stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // stride + src_yuy2
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
- "vrhadd.u8 d1, d1, d5 \n" // average rows of U
- "vrhadd.u8 d3, d3, d7 \n" // average rows of V
- "vst1.8 {d1}, [%2]! \n" // store 8 U.
- "vst1.8 {d3}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(stride_yuy2), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
- "d7" // Clobber List
- );
-}
-
-void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
- int stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // stride + src_uyvy
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
- "vrhadd.u8 d0, d0, d4 \n" // average rows of U
- "vrhadd.u8 d2, d2, d6 \n" // average rows of V
- "vst1.8 {d0}, [%2]! \n" // store 8 U.
- "vst1.8 {d2}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(stride_uyvy), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
- "d7" // Clobber List
- );
-}
-
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- asm volatile(
- "vld1.8 {q2}, [%3] \n" // shuffler
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
- "subs %2, %2, #4 \n" // 4 processed per loop
- "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
- "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
- "vst1.8 {q1}, [%1]! \n" // store 4.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(shuffler) // %3
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
- );
-}
-
-void I422ToYUY2Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_yuy2,
- int width) {
- asm volatile(
- "1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
- "vld1.8 {d1}, [%1]! \n" // load 8 Us
- "vld1.8 {d3}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_yuy2), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3");
-}
-
-void I422ToUYVYRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uyvy,
- int width) {
- asm volatile(
- "1: \n"
- "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
- "vld1.8 {d0}, [%1]! \n" // load 8 Us
- "vld1.8 {d2}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_uyvy), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3");
-}
-
-void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_rgb565,
- int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTORGB565
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb565), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
-}
-
-void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width) {
- asm volatile(
- "vdup.32 d2, %2 \n" // dither4
- "1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d20, d20, d2 \n"
- "vqadd.u8 d21, d21, d2 \n"
- "vqadd.u8 d22, d22, d2 \n" // add for dither
- ARGBTORGB565
- "vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
- "bgt 1b \n"
- : "+r"(dst_rgb) // %0
- : "r"(src_argb), // %1
- "r"(dither4), // %2
- "r"(width) // %3
- : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
-}
-
-void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb1555,
- int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTOARGB1555
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb1555), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
-}
-
-void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb4444,
- int width) {
- asm volatile(
- "vmov.u8 d4, #0x0f \n" // bits to clear with
- // vbic.
- "1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTOARGB4444
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb4444), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
-}
-
-void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
-}
-
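ARGBToYRow's multiply-accumulate computes a BT.601 studio-range luma: the byte coefficients sum to 111, vqrshrun.s16 #7 is a rounding divide by 128, and vqadd.u8 adds the +16 bias with saturation. Scalar form (a sketch derived from the NEON path):

// Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16, saturated to 255.
static uint8_t RGBToYSketch(uint8_t r, uint8_t g, uint8_t b) {
  int y = ((13 * b + 65 * g + 33 * r + 64) >> 7) + 16;
  return (uint8_t)(y > 255 ? 255 : y);
}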
-void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q3}, [%1]! \n" // store 16 A's.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
-}
-
-// 8x1 pixels: full-resolution U and V, no 2x2 subsampling.
-void ARGBToUV444Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "vmov.u8 d24, #112 \n" // UB / VR 0.875
- // coefficient
- "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
- "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
- "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
- "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlsl.u8 q2, d1, d25 \n" // G
- "vmlsl.u8 q2, d2, d26 \n" // R
- "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
-
- "vmull.u8 q3, d2, d24 \n" // R
- "vmlsl.u8 q3, d1, d28 \n" // G
- "vmlsl.u8 q3, d0, d27 \n" // B
- "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
-
- "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
-
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
- "q15");
-}
-
-// clang-format off
-// 16x2 pixels -> 8x1. width is the number of ARGB pixels, e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
- "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
- "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
- "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
- "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
- "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
- "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
- "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
- "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
- "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
- "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
-// clang-format on
-
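RGBTOUV consumes 16-bit channel values that are twice the 2x2 average (vrshr #1 halves the four-pixel sums only once), which is why the coefficients are written pre-halved as #112 / 2 and so on. Folding the doubling back in, each output byte reduces to the sketch below, which matches libyuv's C path:

// U/V from 2x2-averaged 8-bit B,G,R; 0x8080 supplies the +128 bias plus
// 0.5 of rounding before the >>8.
static uint8_t RGBToUSketch(uint8_t b, uint8_t g, uint8_t r) {
  int u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
  return (uint8_t)(u < 0 ? 0 : (u > 255 ? 255 : u));
}
static uint8_t RGBToVSketch(uint8_t b, uint8_t g, uint8_t r) {
  int v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}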
-// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-void ARGBToUVRow_NEON(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride_argb), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
- "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
- "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
- "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
- "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride_argb), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void BGRAToUVRow_NEON(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_bgra
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
- "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
- "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q1, q1, #1 \n" // 2x average
- "vrshr.u16 q2, q2, #1 \n"
- "vrshr.u16 q3, q3, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q3, q2, q1)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(src_stride_bgra), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void ABGRToUVRow_NEON(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_abgr
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(src_stride_abgr), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void RGBAToUVRow_NEON(const uint8_t* src_rgba,
- int src_stride_rgba,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgba
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
- "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
- "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(src_stride_rgba), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgb24
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(src_stride_rgb24), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-void RAWToUVRow_NEON(const uint8_t* src_raw,
- int src_stride_raw,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_raw
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(src_stride_raw), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-// 16x2 pixels -> 8x1. width is the number of ARGB pixels, e.g. 16.
-void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
- // coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(src_stride_rgb565), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
- "q9", "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-// 16x2 pixels -> 8x1. width is the number of ARGB pixels, e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
- // coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(src_stride_argb1555), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
- "q9", "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-// 16x2 pixels -> 8x1. width is the number of ARGB pixels, e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
- // coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(src_stride_argb4444), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
- "q9", "q10", "q11", "q12", "q13", "q14", "q15");
-}
-
-void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
-}
-
-void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
-}
-
-void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
-}
-
-void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // R
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
-}
-
-void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // R
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
-}
-
-void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // B
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
-}
-
-void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
-}
-
-void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
-}
-
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
-
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
- // General purpose row blend.
- "1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
-
- // Blend 50 / 50.
- "50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(y1_fraction) // %4
- :
- : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
-}
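
For reference, a scalar C sketch of the general-purpose blend above (a hypothetical helper, not part of the library): each output byte is (src0 * (256 - f) + src1 * f + 128) >> 8, and the NEON code special-cases f == 0 (plain copy) and f == 128 (vrhadd 50/50 average).

#include <stdint.h>

// Scalar equivalent of the general row-blend loop (assumes 0 <= f <= 255).
static void InterpolateRow_C_Sketch(uint8_t* dst, const uint8_t* src0,
                                    const uint8_t* src1, int width, int f) {
  int f0 = 256 - f;  // weight of the top row
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint8_t)((src0[i] * f0 + src1[i] * f + 128) >> 8);  // vrshrn #8
  }
}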
-
-// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "subs %3, #8 \n"
- "blt 89f \n"
- // Blend 8 pixels.
- "8: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
- "bge 8b \n"
-
- "89: \n"
- "adds %3, #8-1 \n"
- "blt 99f \n"
-
-      // Blend 1 pixel.
- "1: \n"
- "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
- "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
- "subs %3, %3, #1 \n" // 1 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
- "bge 1b \n"
-
- "99: \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
-}
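
The blend follows the identity in the comment above ARGBBlendRow_NEON. A scalar sketch (hypothetical helper; the NEON version additionally rounds the product and uses saturating subtract/add):

#include <stdint.h>

// Per-pixel "over" blend: out = s + d - d * sa / 256, alpha forced to 255.
static void ARGBBlendPixel_C_Sketch(const uint8_t s[4], const uint8_t d[4],
                                    uint8_t out[4]) {
  uint32_t sa = s[3];
  for (int c = 0; c < 3; ++c) {  // B, G, R
    uint32_t v = s[c] + d[c] - ((d[c] * sa) >> 8);
    out[c] = v > 255 ? 255 : (uint8_t)v;
  }
  out[3] = 255;  // a = 255
}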
-
-// Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // Attenuate 8 pixels.
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d0, d3 \n" // b * a
- "vmull.u8 q11, d1, d3 \n" // g * a
- "vmull.u8 q12, d2, d3 \n" // r * a
- "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
- "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
- "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
-}
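
Attenuation multiplies each color channel by its own alpha. A scalar sketch of the loop above (hypothetical helper, not part of the library):

#include <stdint.h>

// out = c * a / 256 with rounding, matching vmull.u8 + vqrshrn.u16 #8.
static void ARGBAttenuatePixel_C_Sketch(uint8_t bgra[4]) {
  uint32_t a = bgra[3];
  for (int c = 0; c < 3; ++c) {
    bgra[c] = (uint8_t)((bgra[c] * a + 128) >> 8);
  }
}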
-
-// Quantize 8 ARGB pixels (32 bytes).
-// dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
- int scale,
- int interval_size,
- int interval_offset,
- int width) {
- asm volatile(
- "vdup.u16 q8, %2 \n"
- "vshr.u16 q8, q8, #1 \n" // scale >>= 1
- "vdup.u16 q9, %3 \n" // interval multiply.
- "vdup.u16 q10, %4 \n" // interval add
-
- // 8 pixel loop.
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmovl.u8 q0, d0 \n" // b (0 .. 255)
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q2, d4 \n"
- "vqdmulh.s16 q0, q0, q8 \n" // b * scale
- "vqdmulh.s16 q1, q1, q8 \n" // g
- "vqdmulh.s16 q2, q2, q8 \n" // r
- "vmul.u16 q0, q0, q9 \n" // b * interval_size
- "vmul.u16 q1, q1, q9 \n" // g
- "vmul.u16 q2, q2, q9 \n" // r
- "vadd.u16 q0, q0, q10 \n" // b + interval_offset
- "vadd.u16 q1, q1, q10 \n" // g
- "vadd.u16 q2, q2, q10 \n" // r
- "vqmovn.u16 d0, q0 \n"
- "vqmovn.u16 d2, q1 \n"
- "vqmovn.u16 d4, q2 \n"
- "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
-}
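
The quantize kernel halves 'scale' up front because vqdmulh doubles its product: (2 * v * (scale >> 1)) >> 16 equals (v * scale) >> 16 for even scale. A scalar sketch of the per-channel math (hypothetical helper, not part of the library):

#include <stdint.h>

// dst = (dst * scale >> 16) * interval_size + interval_offset, saturated.
static uint8_t Quantize_C_Sketch(uint8_t v, int scale, int interval_size,
                                 int interval_offset) {
  int q = ((v * scale) >> 16) * interval_size + interval_offset;
  return q > 255 ? 255 : (uint8_t)q;
}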
-
-// Shade 8 pixels at a time by specified value.
-// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register in d0 to d7.
-// Rounding in vqrdmulh adds +1 to the high half if the high bit of the low
-// s16 is set.
-void ARGBShadeRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- uint32_t value) {
- asm volatile(
- "vdup.u32 q0, %3 \n" // duplicate scale value.
- "vzip.u8 d0, d1 \n" // d0 aarrggbb.
- "vshr.u16 q0, q0, #1 \n" // scale / 2.
-
- // 8 pixel loop.
- "1: \n"
- "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q10, d20 \n" // b (0 .. 255)
- "vmovl.u8 q11, d22 \n"
- "vmovl.u8 q12, d24 \n"
- "vmovl.u8 q13, d26 \n"
- "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
- "vqrdmulh.s16 q11, q11, d0[1] \n" // g
- "vqrdmulh.s16 q12, q12, d0[2] \n" // r
- "vqrdmulh.s16 q13, q13, d0[3] \n" // a
- "vqmovn.u16 d20, q10 \n"
- "vqmovn.u16 d22, q11 \n"
- "vqmovn.u16 d24, q12 \n"
- "vqmovn.u16 d26, q13 \n"
- "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
-}
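
The shade value is duplicated so each 16-bit lane holds one channel byte replicated (s * 257), pre-halved, then vqrdmulh computes (2 * v * lane + 0x8000) >> 16, which is approximately v * s / 255. A scalar sketch (hypothetical helper, not part of the library):

#include <stdint.h>

// Shade one channel value v by the 8-bit shade component s.
static uint8_t Shade_C_Sketch(uint8_t v, uint8_t s) {
  uint32_t lane = ((uint32_t)s * 257) >> 1;     // vdup.u32 + vzip.u8 + vshr #1
  uint32_t r = (2u * v * lane + 0x8000) >> 16;  // vqrdmulh.s16
  return r > 255 ? 255 : (uint8_t)r;            // vqmovn.u16
}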
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
-// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
- asm volatile(
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
- "vmov d1, d0 \n" // G
- "vmov d2, d0 \n" // R
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
-}
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-// b = (r * 35 + g * 68 + b * 17) >> 7
-// g = (r * 45 + g * 88 + b * 22) >> 7
-// r = (r * 50 + g * 98 + b * 24) >> 7
-void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
- asm volatile(
- "vmov.u8 d20, #17 \n" // BB coefficient
- "vmov.u8 d21, #68 \n" // BG coefficient
- "vmov.u8 d22, #35 \n" // BR coefficient
- "vmov.u8 d24, #22 \n" // GB coefficient
- "vmov.u8 d25, #88 \n" // GG coefficient
- "vmov.u8 d26, #45 \n" // GR coefficient
-      "vmov.u8    d28, #24                       \n"  // RB coefficient
-      "vmov.u8    d29, #98                       \n"  // RG coefficient
-      "vmov.u8    d30, #50                       \n"  // RR coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B
- "vmlal.u8 q2, d1, d21 \n" // G
- "vmlal.u8 q2, d2, d22 \n" // R
- "vmull.u8 q3, d0, d24 \n" // B to Sepia G
- "vmlal.u8 q3, d1, d25 \n" // G
- "vmlal.u8 q3, d2, d26 \n" // R
- "vmull.u8 q8, d0, d28 \n" // B to Sepia R
- "vmlal.u8 q8, d1, d29 \n" // G
- "vmlal.u8 q8, d2, d30 \n" // R
- "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
- "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
- "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
- "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
- "q14", "q15");
-}
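
A scalar form of the sepia transform above, directly from the three formulas in the comment (hypothetical helper; note the sums can exceed 255, hence the saturating vqshrn in the NEON code):

#include <stdint.h>

static void SepiaPixel_C_Sketch(uint8_t* bgra) {
  int b = bgra[0], g = bgra[1], r = bgra[2];
  int nb = (r * 35 + g * 68 + b * 17) >> 7;
  int ng = (r * 45 + g * 88 + b * 22) >> 7;
  int nr = (r * 50 + g * 98 + b * 24) >> 7;
  bgra[0] = nb > 255 ? 255 : (uint8_t)nb;
  bgra[1] = ng > 255 ? 255 : (uint8_t)ng;
  bgra[2] = nr > 255 ? 255 : (uint8_t)nr;  // alpha (bgra[3]) is unchanged
}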
-
-// Transform 8 ARGB pixels (32 bytes) with color matrix.
-// TODO(fbarchard): Same as Sepia except the matrix is provided. This function
-// needs to saturate. Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const int8_t* matrix_argb,
- int width) {
- asm volatile(
-      "vld1.8     {q2}, [%3]                     \n"  // load color matrix (16 s8).
- "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
- "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
-
- "1: \n"
- "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
- "vmovl.u8 q9, d18 \n" // g
- "vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q11, d22 \n" // a
- "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
- "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
- "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
- "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
- "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
- "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
- "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
- "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
- "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
- "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
- "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
- "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
- "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
- "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
- "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(matrix_argb) // %3
- : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
-}
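
A scalar sketch of the color-matrix multiply above: each output channel is a saturating dot product of the input BGRA vector with one row of four signed coefficients, scaled down by 6 bits (hypothetical helper, not part of the library):

#include <stdint.h>

static void ColorMatrixPixel_C_Sketch(const uint8_t in[4], uint8_t out[4],
                                      const int8_t m[16]) {
  for (int o = 0; o < 4; ++o) {  // B, G, R, A outputs
    int acc = in[0] * m[4 * o + 0] + in[1] * m[4 * o + 1] +
              in[2] * m[4 * o + 2] + in[3] * m[4 * o + 3];
    acc >>= 6;  // vqshrun.s16 #6
    out[o] = acc < 0 ? 0 : (acc > 255 ? 255 : (uint8_t)acc);
  }
}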
-
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q0, d0, d1 \n" // multiply B
- "vmull.u8 q1, d2, d3 \n" // multiply G
- "vmull.u8 q2, d4, d5 \n" // multiply R
- "vmull.u8 q3, d6, d7 \n" // multiply A
- "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
- "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
- "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
- "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3");
-}
-
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 q0, q0, q2 \n" // add B, G
- "vqadd.u8 q1, q1, q3 \n" // add R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3");
-}
-
-// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqsub.u8 q0, q0, q2 \n" // subtract B, G
- "vqsub.u8 q1, q1, q3 \n" // subtract R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3");
-}
-
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-void SobelRow_NEON(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
- // 8 pixel loop.
- "1: \n"
- "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d0, d0, d1 \n" // add
- "vmov.u8 d1, d0 \n"
- "vmov.u8 d2, d0 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1");
-}
-
-// Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- // 16 pixel loop.
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
- "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vqadd.u8 q0, q0, q1 \n" // add
- "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_y), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1");
-}
-
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-void SobelXYRow_NEON(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
- // 8 pixel loop.
- "1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d1, d0, d2 \n" // add
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1");
-}
-
-// SobelX as a matrix is
-// -1 0 1
-// -2 0 2
-// -1 0 1
-void SobelXRow_NEON(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width) {
- asm volatile(
- "1: \n"
- "vld1.8 {d0}, [%0],%5 \n" // top
- "vld1.8 {d1}, [%0],%6 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%1],%5 \n" // center * 2
- "vld1.8 {d3}, [%1],%6 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%2],%5 \n" // bottom
- "vld1.8 {d3}, [%2],%6 \n"
- "subs %4, %4, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
- "bgt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(src_y2), // %2
- "+r"(dst_sobelx), // %3
- "+r"(width) // %4
- : "r"(2), // %5
- "r"(6) // %6
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
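
The two strided loads above fetch columns x and x + 2 of each row, so each output is the absolute value of a 3-row weighted column difference. A scalar sketch (hypothetical helper, not part of the library):

#include <stdint.h>

// |(t[x]-t[x+2]) + 2*(c[x]-c[x+2]) + (b[x]-b[x+2])|, saturated to 8 bits.
static void SobelXRow_C_Sketch(const uint8_t* t, const uint8_t* c,
                               const uint8_t* b, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    int s = (t[x] - t[x + 2]) + 2 * (c[x] - c[x + 2]) + (b[x] - b[x + 2]);
    if (s < 0) s = -s;
    dst[x] = s > 255 ? 255 : (uint8_t)s;
  }
}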
-
-// SobelY as a matrix is
-// -1 -2 -1
-// 0 0 0
-// 1 2 1
-void SobelYRow_NEON(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width) {
- asm volatile(
- "1: \n"
- "vld1.8 {d0}, [%0],%4 \n" // left
- "vld1.8 {d1}, [%1],%4 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%0],%4 \n" // center * 2
- "vld1.8 {d3}, [%1],%4 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%0],%5 \n" // right
- "vld1.8 {d3}, [%1],%5 \n"
- "subs %3, %3, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%2]! \n" // store 8 sobely
- "bgt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(dst_sobely), // %2
- "+r"(width) // %3
- : "r"(1), // %4
- "r"(6) // %5
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// %y passes a float as a scalar vector for vector * scalar multiply.
-// The register must be d0 to d15 and indexed with [0] or [1] to access
-// the first or second float of the d-register.
-
-void HalfFloat1Row_NEON(const uint16_t* src,
- uint16_t* dst,
- float /*unused*/,
- int width) {
- asm volatile(
-
- "1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
-      "vmovl.u16  q2, d2                         \n"  // 8 ints
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(1.9259299444e-34f) // %3
- : "cc", "memory", "q1", "q2", "q3");
-}
-
-void HalfFloatRow_NEON(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- asm volatile(
-
- "1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
-      "vmovl.u16  q2, d2                         \n"  // 8 ints
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(scale * 1.9259299444e-34f) // %3
- : "cc", "memory", "q1", "q2", "q3");
-}
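
The magic multiplier 1.9259299444e-34f is 2^-112. Multiplying a float by it moves the exponent from float's bias of 127 to half-float's bias of 15 (127 - 15 = 112); after that, bits [28:13] of the float encoding are the IEEE half-float encoding, which vqshrn.u32 #13 extracts with saturation. A scalar sketch (hypothetical helper, not part of the library; saturation of out-of-range values is omitted):

#include <stdint.h>

static uint16_t UInt16ToHalf_C_Sketch(uint16_t v, float scale) {
  union { float f; uint32_t u; } bits;
  bits.f = (float)v * scale * 1.9259299444e-34f;  // scale, rebias exponent
  return (uint16_t)(bits.u >> 13);                // exponent + top of mantissa
}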
-
-void ByteToFloatRow_NEON(const uint8_t* src,
- float* dst,
- float scale,
- int width) {
- asm volatile(
-
- "1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 bytes
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u8 q1, d2 \n" // 8 shorts
- "vmovl.u16 q2, d2 \n" // 8 ints
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // scale
- "vmul.f32 q3, q3, %y3 \n"
- "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(scale) // %3
- : "cc", "memory", "q1", "q2", "q3");
-}
-
-// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
-void GaussCol_NEON(const uint16_t* src0,
- const uint16_t* src1,
- const uint16_t* src2,
- const uint16_t* src3,
- const uint16_t* src4,
- uint32_t* dst,
- int width) {
- asm volatile(
- "vmov.u16 d6, #4 \n" // constant 4
- "vmov.u16 d7, #6 \n" // constant 6
-
- "1: \n"
- "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
- "vld1.16 {q2}, [%4]! \n"
- "vaddl.u16 q0, d2, d4 \n" // * 1
- "vaddl.u16 q1, d3, d5 \n" // * 1
- "vld1.16 {q2}, [%1]! \n"
- "vmlal.u16 q0, d4, d6 \n" // * 4
- "vmlal.u16 q1, d5, d6 \n" // * 4
- "vld1.16 {q2}, [%2]! \n"
- "vmlal.u16 q0, d4, d7 \n" // * 6
- "vmlal.u16 q1, d5, d7 \n" // * 6
- "vld1.16 {q2}, [%3]! \n"
- "vmlal.u16 q0, d4, d6 \n" // * 4
- "vmlal.u16 q1, d5, d6 \n" // * 4
- "subs %6, %6, #8 \n" // 8 processed per loop
- "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
- "bgt 1b \n"
- : "+r"(src0), // %0
- "+r"(src1), // %1
- "+r"(src2), // %2
- "+r"(src3), // %3
- "+r"(src4), // %4
- "+r"(dst), // %5
- "+r"(width) // %6
- :
- : "cc", "memory", "q0", "q1", "q2", "q3");
-}
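
A scalar form of the vertical pass above: a 1-4-6-4-1 weighted sum of five source rows, widened to 32 bits. There is no shift here; the matching >> 8 happens in GaussRow_NEON after the horizontal pass, since the combined kernel weight is 16 * 16 = 256. (Hypothetical helper, not part of the library.)

#include <stdint.h>

static void GaussCol_C_Sketch(const uint16_t* s0, const uint16_t* s1,
                              const uint16_t* s2, const uint16_t* s3,
                              const uint16_t* s4, uint32_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = s0[i] + 4 * s1[i] + 6 * s2[i] + 4 * s3[i] + s4[i];
  }
}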
-
-// Filter 5 horizontal samples with 1, 4, 6, 4, 1 coefficients to produce 1 row.
-void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
- const uint32_t* src1 = src + 1;
- const uint32_t* src2 = src + 2;
- const uint32_t* src3 = src + 3;
- asm volatile(
- "vmov.u32 q10, #4 \n" // constant 4
- "vmov.u32 q11, #6 \n" // constant 6
-
- "1: \n"
- "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
- "vld1.32 {q2}, [%0] \n"
- "vadd.u32 q0, q0, q1 \n" // * 1
- "vadd.u32 q1, q1, q2 \n" // * 1
- "vld1.32 {q2, q3}, [%2]! \n"
- "vmla.u32 q0, q2, q11 \n" // * 6
- "vmla.u32 q1, q3, q11 \n" // * 6
- "vld1.32 {q2, q3}, [%1]! \n"
- "vld1.32 {q8, q9}, [%3]! \n"
- "vadd.u32 q2, q2, q8 \n" // add rows for * 4
- "vadd.u32 q3, q3, q9 \n"
- "vmla.u32 q0, q2, q10 \n" // * 4
- "vmla.u32 q1, q3, q10 \n" // * 4
- "subs %5, %5, #8 \n" // 8 processed per loop
- "vqshrn.u32 d0, q0, #8 \n" // round and pack
- "vqshrn.u32 d1, q1, #8 \n"
- "vst1.u16 {q0}, [%4]! \n" // store 8 samples
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(src1), // %1
- "+r"(src2), // %2
- "+r"(src3), // %3
- "+r"(dst), // %4
- "+r"(width) // %5
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
-}
-
-// Convert biplanar NV21 to packed YUV24
-void NV21ToYUV24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_yuv24,
- int width) {
- asm volatile(
- "1: \n"
- "vld1.8 {q2}, [%0]! \n" // load 16 Y values
- "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
- "vmov d1, d0 \n"
- "vzip.u8 d0, d1 \n" // VV
- "vmov d3, d2 \n"
- "vzip.u8 d2, d3 \n" // UU
- "subs %3, %3, #16 \n" // 16 pixels per loop
- "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
- "vst3.8 {d1, d3, d5}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_yuv24), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2");
-}
-
-void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_uv,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_AYUV
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
- // pixels.
- "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
- // pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
- // pixels.
-      "vpadal.u8  q0, q4                         \n"  // V 16 bytes -> 8 shorts.
-      "vpadal.u8  q1, q5                         \n"  // U 16 bytes -> 8 shorts.
- "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
- "vqrshrun.s16 d0, q1, #2 \n"
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
- "bgt 1b \n"
- : "+r"(src_ayuv), // %0
- "+r"(src_stride_ayuv), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
-}
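
AYUV is stored V, U, Y, A per pixel here (d0 = V, d2 = U, d4 = Y in the vld4 above). A scalar sketch of the 2x2 chroma average (hypothetical helper, not part of the library):

#include <stdint.h>

// Average U and V over a 2x2 block with rounding; store interleaved UV.
static void AYUVToUV_C_Sketch(const uint8_t* row0, const uint8_t* row1,
                              uint8_t* dst_uv, int pairs) {
  for (int i = 0; i < pairs; ++i) {
    const uint8_t* p0 = row0 + i * 8;  // two 4-byte AYUV pixels
    const uint8_t* p1 = row1 + i * 8;
    dst_uv[2 * i + 0] = (uint8_t)((p0[1] + p0[5] + p1[1] + p1[5] + 2) >> 2);  // U
    dst_uv[2 * i + 1] = (uint8_t)((p0[0] + p0[4] + p1[0] + p1[4] + 2) >> 2);  // V
  }
}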
-
-void AYUVToVURow_NEON(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_vu,
- int width) {
- asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_AYUV
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
- // pixels.
- "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
- // pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
- // pixels.
-      "vpadal.u8  q0, q4                         \n"  // V 16 bytes -> 8 shorts.
-      "vpadal.u8  q1, q5                         \n"  // U 16 bytes -> 8 shorts.
- "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
- "vqrshrun.s16 d1, q1, #2 \n"
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
- "bgt 1b \n"
- : "+r"(src_ayuv), // %0
- "+r"(src_stride_ayuv), // %1
- "+r"(dst_vu), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
-}
-
-// Copy row of AYUV Y's into Y.
-// Similar to ARGBExtractAlphaRow_NEON
-void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
- asm volatile(
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
- "bgt 1b \n"
- : "+r"(src_ayuv), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3");
-}
-
-// Convert biplanar UV channel of NV12 to NV21
-void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
- asm volatile(
- "1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
- "vld2.8 {d1, d3}, [%0]! \n"
- "vorr.u8 q2, q0, q0 \n" // move U after V
- "subs %2, %2, #16 \n" // 16 pixels per loop
- "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_vu), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2");
-}
-
-#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc
deleted file mode 100644
index f5cbb470..00000000
--- a/files/source/row_neon64.cc
+++ /dev/null
@@ -1,3036 +0,0 @@
-/*
- * Copyright 2014 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon armv8 64 bit.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-// Read 8 Y, 4 U and 4 V from 422
-#define READYUV422 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "ld1 {v1.s}[0], [%1], #4 \n" \
- "ld1 {v1.s}[1], [%2], #4 \n"
-
-// Read 8 Y, 8 U and 8 V from 444
-#define READYUV444 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "ld1 {v1.d}[0], [%1], #8 \n" \
- "ld1 {v1.d}[1], [%2], #8 \n" \
- "uaddlp v1.8h, v1.16b \n" \
- "rshrn v1.8b, v1.8h, #1 \n"
-
-// Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "movi v1.8b , #128 \n"
-
-// Read 8 Y and 4 UV from NV12
-#define READNV12 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "ld1 {v2.8b}, [%1], #8 \n" \
- "uzp1 v1.8b, v2.8b, v2.8b \n" \
- "uzp2 v3.8b, v2.8b, v2.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
-
-// Read 8 Y and 4 VU from NV21
-#define READNV21 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "ld1 {v2.8b}, [%1], #8 \n" \
- "uzp1 v3.8b, v2.8b, v2.8b \n" \
- "uzp2 v1.8b, v2.8b, v2.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
-
-// Read 8 YUY2
-#define READYUY2 \
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
- "uzp2 v3.8b, v1.8b, v1.8b \n" \
- "uzp1 v1.8b, v1.8b, v1.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
-
-// Read 8 UYVY
-#define READUYVY \
- "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
- "orr v0.8b, v3.8b, v3.8b \n" \
- "uzp1 v1.8b, v2.8b, v2.8b \n" \
- "uzp2 v3.8b, v2.8b, v2.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
-
-#define YUVTORGB_SETUP \
- "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
- "ld1r {v31.4s}, [%[kYToRgb]] \n" \
- "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
- "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
-
-#define YUVTORGB(vR, vG, vB) \
- "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
- "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
- "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
- "ushll v0.4s, v0.4h, #0 \n" \
- "mul v3.4s, v3.4s, v31.4s \n" \
- "mul v0.4s, v0.4s, v31.4s \n" \
- "sqshrun v0.4h, v0.4s, #16 \n" \
- "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
- "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
- "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
- "uxtl v2.8h, v2.8b \n" \
- "uxtl v1.8h, v1.8b \n" /* Extract U */ \
- "mul v3.8h, v1.8h, v27.8h \n" \
- "mul v5.8h, v1.8h, v29.8h \n" \
- "mul v6.8h, v2.8h, v30.8h \n" \
- "mul v7.8h, v2.8h, v28.8h \n" \
- "sqadd v6.8h, v6.8h, v5.8h \n" \
- "sqadd " #vB \
- ".8h, v24.8h, v0.8h \n" /* B */ \
- "sqadd " #vG \
- ".8h, v25.8h, v0.8h \n" /* G */ \
- "sqadd " #vR \
- ".8h, v26.8h, v0.8h \n" /* R */ \
- "sqadd " #vB ".8h, " #vB \
- ".8h, v3.8h \n" /* B */ \
- "sqsub " #vG ".8h, " #vG \
- ".8h, v6.8h \n" /* G */ \
- "sqadd " #vR ".8h, " #vR \
- ".8h, v7.8h \n" /* R */ \
- "sqshrun " #vB ".8b, " #vB \
- ".8h, #6 \n" /* B */ \
- "sqshrun " #vG ".8b, " #vG \
- ".8h, #6 \n" /* G */ \
- "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
-
-void I444ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
- READYUV444
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void I422ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- const uint8_t* src_a,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "ld1 {v23.8b}, [%3], #8 \n"
- "subs %w5, %w5, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(src_a), // %3
- "+r"(dst_argb), // %4
- "+r"(width) // %5
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void I422ToRGBARow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v20.8b, #255 \n" /* A */
- "1: \n"
- READYUV422
- YUVTORGB(v23, v22, v21)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgba), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void I422ToRGB24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb24), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-#define ARGBTORGB565 \
- "shll v0.8h, v22.8b, #8 \n" /* R */ \
- "shll v21.8h, v21.8b, #8 \n" /* G */ \
- "shll v20.8h, v20.8b, #8 \n" /* B */ \
- "sri v0.8h, v21.8h, #5 \n" /* RG */ \
- "sri v0.8h, v20.8h, #11 \n" /* RGB */
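
A scalar equivalent of the ARGBTORGB565 packing above: the sri (shift right and insert) chain keeps the top 5/6/5 bits of R, G and B in one 16-bit value. (A sketch, not part of the library.)

#include <stdint.h>

static uint16_t PackRGB565_C_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}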
-
-void I422ToRGB565Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
-      "1:                                        \n"
-      READYUV422
-      YUVTORGB(v22, v21, v20)
-      "subs       %w4, %w4, #8                   \n"
-      ARGBTORGB565
-      "st1        {v0.8h}, [%3], #16             \n"  // store 8 RGB565 pixels.
-      "b.gt       1b                             \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb565), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
-}
-
-#define ARGBTOARGB1555 \
- "shll v0.8h, v23.8b, #8 \n" /* A */ \
- "shll v22.8h, v22.8b, #8 \n" /* R */ \
- "shll v21.8h, v21.8b, #8 \n" /* G */ \
- "shll v20.8h, v20.8b, #8 \n" /* B */ \
- "sri v0.8h, v22.8h, #1 \n" /* AR */ \
- "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
- "sri v0.8h, v20.8h, #11 \n" /* ARGB */
-
-void I422ToARGB1555Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb1555,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
-      "1:                                        \n"
-      READYUV422
-      YUVTORGB(v22, v21, v20)
-      "subs       %w4, %w4, #8                   \n"
-      ARGBTOARGB1555
-      "st1        {v0.8h}, [%3], #16             \n"  // store 8 ARGB1555 pixels.
-      "b.gt       1b                             \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb1555), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
-}
-
-#define ARGBTOARGB4444 \
- /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
- "ushr v20.8b, v20.8b, #4 \n" /* B */ \
- "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
- "ushr v22.8b, v22.8b, #4 \n" /* R */ \
- "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
- "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
- "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
- "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
-
-void I422ToARGB4444Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb4444,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v4.16b, #0x0f \n" // bits to clear with vbic.
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "movi v23.8b, #255 \n"
- ARGBTOARGB4444
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb4444), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READYUV400
- YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
- [kUVToG]"r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- asm volatile(
- "movi v23.8b, #255 \n"
- "1: \n"
- "ld1 {v20.8b}, [%0], #8 \n"
- "orr v21.8b, v20.8b, v20.8b \n"
- "orr v22.8b, v20.8b, v20.8b \n"
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v20", "v21", "v22", "v23");
-}
-
-void NV12ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READNV12
- YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void NV21ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READNV21
- YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void NV12ToRGB24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READNV12
- YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb24), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void NV21ToRGB24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READNV21
- YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_rgb24), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void NV12ToRGB565Row_NEON(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile(
- YUVTORGB_SETUP
-      "1:                                        \n"
-      READNV12
-      YUVTORGB(v22, v21, v20)
-      "subs       %w3, %w3, #8                   \n"
-      ARGBTORGB565
-      "st1        {v0.8h}, [%2], #16             \n"  // store 8 RGB565 pixels.
-      "b.gt       1b                             \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb565), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
-}
-
-void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READYUY2
- YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READUYVY
- YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
- "b.gt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
-}
-
-// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
-void SplitUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store U
- "st1 {v1.16b}, [%2], #16 \n" // store V
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-
-// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load U
- "ld1 {v1.16b}, [%1], #16 \n" // load V
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
- "b.gt 1b \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-
-// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
-void SplitRGBRow_NEON(const uint8_t* src_rgb,
- uint8_t* dst_r,
- uint8_t* dst_g,
- uint8_t* dst_b,
- int width) {
- asm volatile(
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store R
- "st1 {v1.16b}, [%2], #16 \n" // store G
- "st1 {v2.16b}, [%3], #16 \n" // store B
- "b.gt 1b \n"
- : "+r"(src_rgb), // %0
- "+r"(dst_r), // %1
- "+r"(dst_g), // %2
- "+r"(dst_b), // %3
- "+r"(width) // %4
- : // Input registers
- : "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
-}
-
-// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time.
-void MergeRGBRow_NEON(const uint8_t* src_r,
- const uint8_t* src_g,
- const uint8_t* src_b,
- uint8_t* dst_rgb,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load R
- "ld1 {v1.16b}, [%1], #16 \n" // load G
- "ld1 {v2.16b}, [%2], #16 \n" // load B
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
- "b.gt 1b \n"
- : "+r"(src_r), // %0
- "+r"(src_g), // %1
- "+r"(src_b), // %2
- "+r"(dst_rgb), // %3
- "+r"(width) // %4
- : // Input registers
- : "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
-}
-
-// Copy multiple of 32.
-void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- "1: \n"
- "ldp q0, q1, [%0], #32 \n"
- "subs %w2, %w2, #32 \n" // 32 processed per loop
- "stp q0, q1, [%1], #32 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2 // Output registers
- : // Input registers
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-
-// SetRow writes 'width' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
- asm volatile(
- "dup v0.16b, %w2 \n" // duplicate 16 bytes
- "1: \n"
- "subs %w1, %w1, #16 \n" // 16 bytes per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
- : "+r"(dst), // %0
- "+r"(width) // %1
- : "r"(v8) // %2
- : "cc", "memory", "v0");
-}
-
-void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
- asm volatile(
- "dup v0.4s, %w2 \n" // duplicate 4 ints
- "1: \n"
- "subs %w1, %w1, #4 \n" // 4 ints per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
- : "+r"(dst), // %0
- "+r"(width) // %1
- : "r"(v32) // %2
- : "cc", "memory", "v0");
-}
-
-void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- // Start at end of source row.
- "add %0, %0, %w2, sxtw \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #16 \n" // 16 pixels per loop.
- "rev64 v0.16b, v0.16b \n"
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0");
-}
-
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- // Start at end of source row.
- "add %0, %0, %w3, sxtw #1 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
- "subs %w3, %w3, #8 \n" // 8 pixels per loop.
- "rev64 v0.8b, v0.8b \n"
- "rev64 v1.8b, v1.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // dst += 8
- "st1 {v1.8b}, [%2], #8 \n"
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((ptrdiff_t)-16) // %4
- : "cc", "memory", "v0", "v1");
-}
-
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
- asm volatile(
- // Start at end of source row.
- "add %0, %0, %w2, sxtw #2 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "rev64 v0.4s, v0.4s \n"
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0");
-}
-
-void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "movi v4.8b, #255 \n" // Alpha
- "1: \n"
- "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
-
-void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
- asm volatile(
- "movi v5.8b, #255 \n" // Alpha
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
-}
-
-void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
- asm volatile(
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
-
-#define RGB565TOARGB \
- "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
- "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
- "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
- "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
- "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
- "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
- "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
- "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
- "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
- "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
- "dup v2.2D, v0.D[1] \n" /* R */
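
A scalar equivalent of the RGB565TOARGB unpacking above: each field widens to 8 bits by replicating its top bits into the low bits (BBBBB000 | 00000BBB), which maps 0..31 (or 0..63 for G) onto the full 0..255 range. (A sketch, not part of the library.)

#include <stdint.h>

static void UnpackRGB565_C_Sketch(uint16_t p, uint8_t* b, uint8_t* g,
                                  uint8_t* r) {
  uint8_t b5 = (uint8_t)(p & 0x1f), g6 = (uint8_t)((p >> 5) & 0x3f);
  uint8_t r5 = (uint8_t)(p >> 11);
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}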
-
-void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "movi v3.8b, #255 \n" // Alpha
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
- );
-}
-
-#define ARGB1555TOARGB \
- "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
- "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
- "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
- \
- "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
- "xtn2 v3.16b, v2.8h \n" \
- \
- "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
- "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
- \
- "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
- "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
- \
- "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
- "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
- "dup v1.2D, v0.D[1] \n" \
- "dup v3.2D, v2.D[1] \n"
-
-// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB \
- "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
- "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
- "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
- \
- "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
- "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
- \
- "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
- "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
- \
- "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
- "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
- "dup v1.2D, v0.D[1] \n" /* G */
-
-void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "movi v3.8b, #255 \n" // Alpha
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-#define ARGB4444TOARGB \
- "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
- "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
- "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
- "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
- "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
- "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
- "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
- "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
- "dup v0.2D, v2.D[1] \n" \
- "dup v1.2D, v3.D[1] \n"
-
-void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
-
-void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_rgb24,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
- // RGB24.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
-
-void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
- asm volatile(
- "1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v4.8b, v2.8b, v2.8b \n" // mov g
- "orr v5.8b, v1.8b, v1.8b \n" // mov b
- "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_raw), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
-}
-
-void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
- asm volatile(
- "1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-
-void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
- asm volatile(
- "1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-
-void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
- int stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
- "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
- "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(src_yuy2b), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
- "v7" // Clobber List
- );
-}
-
-void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
- int stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
- "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(src_uyvyb), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
- "v7" // Clobber List
- );
-}
-
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- asm volatile(
- "ld1 {v2.16b}, [%3] \n" // shuffler
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
- "subs %w2, %w2, #4 \n" // 4 processed per loop
- "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
- "st1 {v1.16b}, [%1], #16 \n" // store 4.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(shuffler) // %3
- : "cc", "memory", "v0", "v1", "v2" // Clobber List
- );
-}
-
-void I422ToYUY2Row_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_yuy2,
- int width) {
- asm volatile(
- "1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
- "orr v2.8b, v1.8b, v1.8b \n"
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_yuy2), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
-}
-
-void I422ToUYVYRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uyvy,
- int width) {
- asm volatile(
- "1: \n"
- "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
- "orr v3.8b, v2.8b, v2.8b \n"
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_uyvy), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
-}
-
-void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_rgb565,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGBTORGB565
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb565), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
-}
-
-void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width) {
- asm volatile(
- "dup v1.4s, %w2 \n" // dither4
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v20.8b, v20.8b, v1.8b \n"
- "uqadd v21.8b, v21.8b, v1.8b \n"
- "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
- "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
- : "+r"(dst_rgb) // %0
- : "r"(src_argb), // %1
- "r"(dither4), // %2
- "r"(width) // %3
- : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
-}
-
-void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb1555,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGBTOARGB1555
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- // ARGB1555.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb1555), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
-}
-
-void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb4444,
- int width) {
- asm volatile(
- "movi v4.16b, #0x0f \n" // bits to clear with
- // vbic.
- "1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGBTOARGB4444
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- // ARGB4444.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb4444), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
-}
-
-void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
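-
-// Scalar sketch (not part of the original source) of the luma math in
-// ARGBToYRow_NEON above: BT.601 studio-range Y with rounding, then +16.
-// 13 + 65 + 33 = 111 < 128, so the sum cannot overflow after the >> 7.
-static __inline uint8_t ARGBToY_C_sketch(uint8_t b, uint8_t g, uint8_t r) {
-  return (uint8_t)(((13 * b + 65 * g + 33 * r + 64) >> 7) + 16);
-}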
-
-void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- asm volatile(
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
- // pixels
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #15 \n" // B * 0.11400 coefficient
- "movi v5.8b, #75 \n" // G * 0.58700 coefficient
- "movi v6.8b, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
-}
-
-// 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8_t* src_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- asm volatile(
- "movi v24.8b, #112 \n" // UB / VR 0.875
- // coefficient
- "movi v25.8b, #74 \n" // UG -0.5781 coefficient
- "movi v26.8b, #38 \n" // UR -0.2969 coefficient
- "movi v27.8b, #18 \n" // VB -0.1406 coefficient
- "movi v28.8b, #94 \n" // VG -0.7344 coefficient
- "movi v29.16b,#0x80 \n" // 128.5
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- // pixels.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlsl v4.8h, v1.8b, v25.8b \n" // G
- "umlsl v4.8h, v2.8b, v26.8b \n" // R
- "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
-
- "umull v3.8h, v2.8b, v24.8b \n" // R
- "umlsl v3.8h, v1.8b, v28.8b \n" // G
- "umlsl v3.8h, v0.8b, v27.8b \n" // B
- "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
-
- "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
-
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
- "v27", "v28", "v29");
-}
-
-#define RGBTOUV_SETUP_REG \
- "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
- "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
- "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
- "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
- "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
- "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
-
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-// clang-format off
-#define RGBTOUV(QB, QG, QR) \
- "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
- "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
- "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
- "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
- "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
- "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
- "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
- "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
- "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
- "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
-// clang-format on
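-
-// Scalar sketch (not part of the original source) of the RGBTOUV macro.
-// The inputs are 2x-scaled 2x2 block averages (sum of 4 samples >> 1),
-// which is why the coefficients are half of the 112/74/38/18/94 set used
-// by ARGBToUV444Row_NEON. Adding 0x8080 biases the result so the final
-// >> 8 lands on unsigned 128 +/- uv; the saturation of uqshrn is omitted.
-static __inline uint8_t RGBToU_C_sketch(int b2, int g2, int r2) {
-  return (uint8_t)((b2 * 56 - g2 * 37 - r2 * 19 + 0x8080) >> 8);
-}
-static __inline uint8_t RGBToV_C_sketch(int b2, int g2, int r2) {
-  return (uint8_t)((r2 * 56 - g2 * 47 - b2 * 9 + 0x8080) >> 8);
-}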
-
-// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-// TODO(fbarchard): consider ptrdiff_t for all strides.
-
-void ARGBToUVRow_NEON(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_argb_1 = src_argb + src_stride_argb;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
-
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_argb_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-// TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_argb_1 = src_argb + src_stride_argb;
- asm volatile (
- "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
- "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
- "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
- "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
- "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
- "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_argb_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-void BGRAToUVRow_NEON(const uint8_t* src_bgra,
- int src_stride_bgra,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
- "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v3.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(src_bgra_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-void ABGRToUVRow_NEON(const uint8_t* src_abgr,
- int src_stride_abgr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v3.8h, #1 \n" // 2x average
- "urshr v2.8h, v2.8h, #1 \n"
- "urshr v1.8h, v1.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v0.8h, v2.8h, v1.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(src_abgr_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-void RGBAToUVRow_NEON(const uint8_t* src_rgba,
- int src_stride_rgba,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(src_rgba_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
- int src_stride_rgb24,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(src_rgb24_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-void RAWToUVRow_NEON(const uint8_t* src_raw,
- int src_stride_raw,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_raw_1 = src_raw + src_stride_raw;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
- "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
- "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v2.8h, v2.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v0.8h, v0.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
- RGBTOUV(v2.8h, v1.8h, v0.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(src_raw_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v23", "v24", "v25"
- );
-}
-
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
- asm volatile(
- "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) /
- // 2
- "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
- "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
- "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
- "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
- "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v17.D[0] \n"
- "ins v18.D[1], v19.D[0] \n"
- "ins v20.D[1], v21.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v18.8h, #1 \n"
- "urshr v6.8h, v20.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v16.8h, v4.8h, v22.8h \n" // B
- "mls v16.8h, v5.8h, v23.8h \n" // G
- "mls v16.8h, v6.8h, v24.8h \n" // R
- "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
- "mul v17.8h, v6.8h, v22.8h \n" // R
- "mls v17.8h, v5.8h, v26.8h \n" // G
- "mls v17.8h, v4.8h, v25.8h \n" // B
- "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(src_rgb565_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27");
-}
-
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
- asm volatile(
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(src_argb1555_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
- "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
- "v28");
-}
-
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
- asm volatile(
- RGBTOUV_SETUP_REG
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(src_argb4444_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
- "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
- "v28"
-
- );
-}
-
-void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v24.8b, #13 \n" // B * 0.1016 coefficient
- "movi v25.8b, #65 \n" // G * 0.5078 coefficient
- "movi v26.8b, #33 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
- "v27");
-}
-
-void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
-
-void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- "movi v24.8b, #13 \n" // B * 0.1016 coefficient
- "movi v25.8b, #65 \n" // G * 0.5078 coefficient
- "movi v26.8b, #33 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
-}
-
-void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // R
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
-}
-
-void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // R
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
-}
-
-void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // B
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
-}
-
-void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
-}
-
-void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
-}
-
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
- const uint8_t* src_ptr1 = src_ptr + src_stride;
- asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
-
- "dup v5.16b, %w4 \n"
- "dup v4.16b, %w5 \n"
- // General purpose row blend.
- "1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v2.8h, v0.8b, v4.8b \n"
- "umull2 v3.8h, v0.16b, v4.16b \n"
- "umlal v2.8h, v1.8b, v5.8b \n"
- "umlal2 v3.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v2.8h, #8 \n"
- "rshrn2 v0.16b, v3.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
-
- // Blend 50 / 50.
- "50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_ptr1), // %2
- "+r"(dst_width), // %3
- "+r"(y1_fraction), // %4
- "+r"(y0_fraction) // %5
- :
- : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
-}
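-
-// Scalar sketch (not part of the original source) of the general blend
-// path above; the 0 and 128 fraction cases are just a copy and an average.
-static void InterpolateRow_C_sketch(uint8_t* dst,
-                                    const uint8_t* src0,
-                                    const uint8_t* src1,
-                                    int width,
-                                    int y1_fraction) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    dst[i] = (uint8_t)(
-        (src0[i] * (256 - y1_fraction) + src1[i] * y1_fraction + 128) >> 8);
-  }
-}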
-
-// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "subs %w3, %w3, #8 \n"
- "b.lt 89f \n"
- // Blend 8 pixels.
- "8: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
- // pixels
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
- // pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- // pixels
- "b.ge 8b \n"
-
- "89: \n"
- "adds %w3, %w3, #8-1 \n"
- "b.lt 99f \n"
-
- // Blend 1 pixel.
- "1: \n"
- "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
- "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
- "subs %w3, %w3, #1 \n" // 1 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
- "b.ge 1b \n"
-
- "99: \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18");
-}
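-
-// Scalar sketch (not part of the original source) of one blended channel,
-// matching the identity in the comment above. uqrshrn rounds the product
-// and uqsub/uqadd clamp; an explicit clamp stands in for them here.
-static __inline uint8_t BlendChannel_C_sketch(int s, int sa, int d) {
-  int v = s + d - ((d * sa + 128) >> 8);
-  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
-}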
-
-// Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // Attenuate 8 pixels.
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v3.8b \n" // b * a
- "umull v5.8h, v1.8b, v3.8b \n" // g * a
- "umull v6.8h, v2.8b, v3.8b \n" // r * a
- "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
- "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
- "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
-}
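-
-// Scalar sketch (not part of the original source): attenuation multiplies
-// each color channel by its alpha with rounding (the uqrshrn #8 above).
-static __inline uint8_t Attenuate_C_sketch(uint8_t c, uint8_t a) {
-  return (uint8_t)((c * a + 128) >> 8);
-}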
-
-// Quantize 8 ARGB pixels (32 bytes).
-// dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
- int scale,
- int interval_size,
- int interval_offset,
- int width) {
- asm volatile(
- "dup v4.8h, %w2 \n"
- "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
- "dup v5.8h, %w3 \n" // interval multiply.
- "dup v6.8h, %w4 \n" // interval add
-
- // 8 pixel loop.
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
- "uxtl v1.8h, v1.8b \n"
- "uxtl v2.8h, v2.8b \n"
- "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
- "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
- "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
- "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
- "mul v1.8h, v1.8h, v5.8h \n" // g
- "mul v2.8h, v2.8h, v5.8h \n" // r
- "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
- "add v1.8h, v1.8h, v6.8h \n" // g
- "add v2.8h, v2.8h, v6.8h \n" // r
- "uqxtn v0.8b, v0.8h \n"
- "uqxtn v1.8b, v1.8h \n"
- "uqxtn v2.8b, v2.8h \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
-}
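-
-// Scalar sketch (not part of the original source) of the quantize formula
-// above; sqdmulh with scale >> 1 is the NEON stand-in for the 16.16
-// fixed-point multiply.
-static __inline uint8_t Quantize_C_sketch(uint8_t v,
-                                          int scale,
-                                          int interval_size,
-                                          int interval_offset) {
-  int q = ((v * scale) >> 16) * interval_size + interval_offset;
-  return (uint8_t)(q < 0 ? 0 : q > 255 ? 255 : q);
-}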
-
-// Shade 8 pixels at a time by specified value.
-// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
-// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- uint32_t value) {
- asm volatile(
- "dup v0.4s, %w3 \n" // duplicate scale value.
- "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
- "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
-
- // 8 pixel loop.
- "1: \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
- "uxtl v5.8h, v5.8b \n"
- "uxtl v6.8h, v6.8b \n"
- "uxtl v7.8h, v7.8b \n"
- "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
- "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
- "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
- "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
- "uqxtn v4.8b, v4.8h \n"
- "uqxtn v5.8b, v5.8h \n"
- "uqxtn v6.8b, v6.8h \n"
- "uqxtn v7.8b, v7.8h \n"
- "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
-}
-
-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
-// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
- asm volatile(
- "movi v24.8b, #15 \n" // B * 0.11400 coefficient
- "movi v25.8b, #75 \n" // G * 0.58700 coefficient
- "movi v26.8b, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlal v4.8h, v1.8b, v25.8b \n" // G
- "umlal v4.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
- "orr v1.8b, v0.8b, v0.8b \n" // G
- "orr v2.8b, v0.8b, v0.8b \n" // R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
-}
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-// b = (r * 35 + g * 68 + b * 17) >> 7
-// g = (r * 45 + g * 88 + b * 22) >> 7
-// r = (r * 50 + g * 98 + b * 24) >> 7
-
-void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
- asm volatile(
- "movi v20.8b, #17 \n" // BB coefficient
- "movi v21.8b, #68 \n" // BG coefficient
- "movi v22.8b, #35 \n" // BR coefficient
- "movi v24.8b, #22 \n" // GB coefficient
- "movi v25.8b, #88 \n" // GG coefficient
- "movi v26.8b, #45 \n" // GR coefficient
- "movi v28.8b, #24 \n" // BB coefficient
- "movi v29.8b, #98 \n" // BG coefficient
- "movi v30.8b, #50 \n" // BR coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
- "umlal v4.8h, v1.8b, v21.8b \n" // G
- "umlal v4.8h, v2.8b, v22.8b \n" // R
- "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
- "umlal v5.8h, v1.8b, v25.8b \n" // G
- "umlal v5.8h, v2.8b, v26.8b \n" // R
- "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
- "umlal v6.8h, v1.8b, v29.8b \n" // G
- "umlal v6.8h, v2.8b, v30.8b \n" // R
- "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
- "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
- "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
-}
-
-// Transform 8 ARGB pixels (32 bytes) with color matrix.
-// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
-// needs to saturate. Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const int8_t* matrix_argb,
- int width) {
- asm volatile(
- "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
- "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
- "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
-
- "1: \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
- "uxtl v17.8h, v17.8b \n" // g
- "uxtl v18.8h, v18.8b \n" // r
- "uxtl v19.8h, v19.8b \n" // a
- "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
- "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
- "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
- "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
- "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
- "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
- "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
- "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
- "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
- "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
- "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
- "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
- "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
- "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
- "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
- "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
- "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
- "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(matrix_argb) // %3
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18", "v19", "v22", "v23", "v24", "v25");
-}
-
-// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // multiply B
- "umull v1.8h, v1.8b, v5.8b \n" // multiply G
- "umull v2.8h, v2.8b, v6.8b \n" // multiply R
- "umull v3.8h, v3.8b, v7.8b \n" // multiply A
- "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
- "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
- "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
- "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
-
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v4.8b \n"
- "uqadd v1.8b, v1.8b, v5.8b \n"
- "uqadd v2.8b, v2.8b, v6.8b \n"
- "uqadd v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
-
-// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- // 8 pixel loop.
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqsub v0.8b, v0.8b, v4.8b \n"
- "uqsub v1.8b, v1.8b, v5.8b \n"
- "uqsub v2.8b, v2.8b, v6.8b \n"
- "uqsub v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
-
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-void SobelRow_NEON(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "movi v3.8b, #255 \n" // alpha
- // 8 pixel loop.
- "1: \n"
- "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v1.8b \n" // add
- "orr v1.8b, v0.8b, v0.8b \n"
- "orr v2.8b, v0.8b, v0.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
-}
-
-// Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_y,
- int width) {
- asm volatile(
- // 16 pixel loop.
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
- "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "uqadd v0.16b, v0.16b, v1.16b \n" // add
- "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
- "b.gt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_y), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1");
-}
-
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-void SobelXYRow_NEON(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- asm volatile(
- "movi v3.8b, #255 \n" // alpha
- // 8 pixel loop.
- "1: \n"
- "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v1.8b, v0.8b, v2.8b \n" // add
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
-}
-
-// SobelX as a matrix is
-// -1 0 1
-// -2 0 2
-// -1 0 1
-void SobelXRow_NEON(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v0.8b}, [%0],%5 \n" // top
- "ld1 {v1.8b}, [%0],%6 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%1],%5 \n" // center * 2
- "ld1 {v3.8b}, [%1],%6 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%2],%5 \n" // bottom
- "ld1 {v3.8b}, [%2],%6 \n"
- "subs %w4, %w4, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
- "b.gt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(src_y2), // %2
- "+r"(dst_sobelx), // %3
- "+r"(width) // %4
- : "r"(2LL), // %5
- "r"(6LL) // %6
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
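-
-// Scalar sketch (not part of the original source) of one SobelX output
-// from the tap pattern above: column 0 minus column 2 across three rows,
-// middle row counted twice, absolute value clamped to 255.
-static __inline uint8_t SobelX_C_sketch(const uint8_t* y0,
-                                        const uint8_t* y1,
-                                        const uint8_t* y2,
-                                        int i) {
-  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
-          (y2[i] - y2[i + 2]);
-  if (s < 0) s = -s;
-  return (uint8_t)(s > 255 ? 255 : s);
-}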
-
-// SobelY as a matrix is
-// -1 -2 -1
-// 0 0 0
-// 1 2 1
-void SobelYRow_NEON(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v0.8b}, [%0],%4 \n" // left
- "ld1 {v1.8b}, [%1],%4 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%0],%4 \n" // center * 2
- "ld1 {v3.8b}, [%1],%4 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%0],%5 \n" // right
- "ld1 {v3.8b}, [%1],%5 \n"
- "subs %w3, %w3, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
- "b.gt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(dst_sobely), // %2
- "+r"(width) // %3
- : "r"(1LL), // %4
- "r"(6LL) // %5
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
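-
-// Scalar sketch (not part of the original source) of one SobelY output:
-// row 0 minus row 1 at offsets 0, 1 (doubled) and 2, absolute value
-// clamped to 255.
-static __inline uint8_t SobelY_C_sketch(const uint8_t* y0,
-                                        const uint8_t* y1,
-                                        int i) {
-  int s = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
-          (y0[i + 2] - y1[i + 2]);
-  if (s < 0) s = -s;
-  return (uint8_t)(s > 255 ? 255 : s);
-}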
-
-// Caveat - rounds float to half float whereas scaling version truncates.
-void HalfFloat1Row_NEON(const uint16_t* src,
- uint16_t* dst,
- float /*unused*/,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fcvtn v1.4h, v2.4s \n" // 8 half floats
- "fcvtn2 v1.8h, v3.4s \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v1", "v2", "v3");
-}
-
-void HalfFloatRow_NEON(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
- "uqshrn2 v1.8h, v3.4s, #13 \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(scale * 1.9259299444e-34f) // %3
- : "cc", "memory", "v1", "v2", "v3");
-}
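-
-// The constant above is scale * 2^-112 (1.9259299444e-34f == 2^-112).
-// The multiply rebiases the float's exponent so that, after uqshrn's 13-bit
-// right shift, the low 16 bits hold an IEEE half. A scalar sketch of one
-// lane (assumes IEEE-754 floats; add <string.h> for memcpy if built alone):
-static uint16_t HalfFromFloat_Sketch(float v, float scale) {
-  float f = v * scale * 1.9259299444e-34f;  // exponent rebias by 2^-112
-  uint32_t bits;
-  memcpy(&bits, &f, sizeof(bits));  // reinterpret float as uint32_t
-  bits >>= 13;                      // align exponent/mantissa to half layout
-  return (uint16_t)(bits > 0xffff ? 0xffff : bits);  // uqshrn saturates
-}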
-
-void ByteToFloatRow_NEON(const uint8_t* src,
- float* dst,
- float scale,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v1.8h, v1.8b \n" // 8 shorts
- "uxtl v2.4s, v1.4h \n" // 8 ints
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(scale) // %3
- : "cc", "memory", "v1", "v2", "v3");
-}
-
-float ScaleMaxSamples_NEON(const float* src,
- float* dst,
- float scale,
- int width) {
- float fmax;
- asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n"
-
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
- "fmax v5.4s, v5.4s, v1.4s \n" // max
- "fmax v6.4s, v6.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "fmax v5.4s, v5.4s, v6.4s \n" // max
- "fmaxv %s3, v5.4s \n" // signed max acculator
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width), // %2
- "=w"(fmax) // %3
- : "w"(scale) // %4
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
- return fmax;
-}
-
-float ScaleSumSamples_NEON(const float* src,
- float* dst,
- float scale,
- int width) {
- float fsum;
- asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n" // max
-
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n"
- "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
- "fmla v6.4s, v2.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "faddp v5.4s, v5.4s, v6.4s \n"
- "faddp v5.4s, v5.4s, v5.4s \n"
- "faddp %3.4s, v5.4s, v5.4s \n" // sum
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width), // %2
- "=w"(fsum) // %3
- : "w"(scale) // %4
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
- return fsum;
-}
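-
-// Note the accumulators sum squares of the *unscaled* input while the scaled
-// samples are stored. A scalar sketch of that contract (hypothetical name):
-static float ScaleSumSamples_Sketch_C(const float* src, float* dst,
-                                      float scale, int width) {
-  float sum_sq = 0.f;
-  for (int i = 0; i < width; ++i) {
-    dst[i] = src[i] * scale;    // fmul: scaled output
-    sum_sq += src[i] * src[i];  // fmla: sum of squares of the raw input
-  }
-  return sum_sq;
-}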
-
-void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
- asm volatile(
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(scale) // %3
- : "cc", "memory", "v1", "v2");
-}
-
-// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
-void GaussCol_NEON(const uint16_t* src0,
- const uint16_t* src1,
- const uint16_t* src2,
- const uint16_t* src3,
- const uint16_t* src4,
- uint32_t* dst,
- int width) {
- asm volatile(
- "movi v6.8h, #4 \n" // constant 4
- "movi v7.8h, #6 \n" // constant 6
-
- "1: \n"
- "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
- "ld1 {v2.8h}, [%4], #16 \n"
- "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
- "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
- "ld1 {v2.8h}, [%1], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "ld1 {v2.8h}, [%2], #16 \n"
- "umlal v0.4s, v2.4h, v7.4h \n" // * 6
- "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
- "ld1 {v2.8h}, [%3], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "subs %w6, %w6, #8 \n" // 8 processed per loop
- "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
- "b.gt 1b \n"
- : "+r"(src0), // %0
- "+r"(src1), // %1
- "+r"(src2), // %2
- "+r"(src3), // %3
- "+r"(src4), // %4
- "+r"(dst), // %5
- "+r"(width) // %6
- :
- : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
-}
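-
-// The vertical pass in scalar form (a sketch; hypothetical name). The 16-bit
-// inputs times the kernel sum of 16 fit comfortably in the 32-bit outputs:
-static void GaussCol_Sketch_C(const uint16_t* src0, const uint16_t* src1,
-                              const uint16_t* src2, const uint16_t* src3,
-                              const uint16_t* src4, uint32_t* dst, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst[i] = src0[i] + 4 * src1[i] + 6 * src2[i] + 4 * src3[i] + src4[i];
-  }
-}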
-
-// filter 5 adjacent samples horizontally with 1, 4, 6, 4, 1 coefficients to
-// produce 1 sample.
-void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
- const uint32_t* src1 = src + 1;
- const uint32_t* src2 = src + 2;
- const uint32_t* src3 = src + 3;
- asm volatile(
- "movi v6.4s, #4 \n" // constant 4
- "movi v7.4s, #6 \n" // constant 6
-
- "1: \n"
- "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
- "add v0.4s, v0.4s, v1.4s \n" // * 1
- "add v1.4s, v1.4s, v2.4s \n" // * 1
- "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
- "mla v0.4s, v2.4s, v7.4s \n" // * 6
- "mla v1.4s, v3.4s, v7.4s \n" // * 6
- "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
- "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
- "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
- "add v3.4s, v3.4s, v5.4s \n"
- "mla v0.4s, v2.4s, v6.4s \n" // * 4
- "mla v1.4s, v3.4s, v6.4s \n" // * 4
- "subs %w5, %w5, #8 \n" // 8 processed per loop
- "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
- "uqrshrn2 v0.8h, v1.4s, #8 \n"
- "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(src1), // %1
- "+r"(src2), // %2
- "+r"(src3), // %3
- "+r"(dst), // %4
- "+r"(width) // %5
- : "r"(32LL) // %6
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
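-
-// The horizontal pass in scalar form (a sketch; hypothetical name). Combined
-// with the vertical pass above, the 5x5 kernel weighs 256, which the rounded
-// 8-bit shift renormalizes:
-static void GaussRow_Sketch_C(const uint32_t* src, uint16_t* dst, int width) {
-  for (int i = 0; i < width; ++i) {
-    uint32_t sum = src[i] + 4 * src[i + 1] + 6 * src[i + 2] +
-                   4 * src[i + 3] + src[i + 4];
-    sum = (sum + 128) >> 8;  // uqrshrn #8: round to nearest
-    dst[i] = (uint16_t)(sum > 0xffff ? 0xffff : sum);  // and saturate
-  }
-}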
-
-// Convert biplanar NV21 to packed YUV24
-void NV21ToYUV24Row_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_yuv24,
- int width) {
- asm volatile(
- "1: \n"
- "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
- "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
- "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
- "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
- "subs %w3, %w3, #16 \n" // 16 pixels per loop
- "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_yuv24), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2");
-}
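-
-// Scalar form of the ld2/zip/st3 sequence (a sketch; hypothetical name).
-// Each VU pair serves two Y pixels, and the triplets land V,U,Y in memory:
-static void NV21ToYUV24Row_Sketch_C(const uint8_t* src_y,
-                                    const uint8_t* src_vu,
-                                    uint8_t* dst_yuv24, int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_yuv24[3 * i + 0] = src_vu[(i & ~1) + 0];  // V, shared per pixel pair
-    dst_yuv24[3 * i + 1] = src_vu[(i & ~1) + 1];  // U
-    dst_yuv24[3 * i + 2] = src_y[i];              // Y
-  }
-}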
-
-void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_uv,
- int width) {
- const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
- asm volatile(
-
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels.
- "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
- "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
- "uqrshrn v2.8b, v1.8h, #2 \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
- "b.gt 1b \n"
- : "+r"(src_ayuv), // %0
- "+r"(src_ayuv_1), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
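-
-// Scalar form of the 2x2 box average above (a sketch; hypothetical name).
-// Per the loads, AYUV here is V,U,Y,A per pixel and the output is U,V pairs.
-// Assumes an even width, as the 16-at-a-time NEON loop does:
-static void AYUVToUVRow_Sketch_C(const uint8_t* src_ayuv, int stride,
-                                 uint8_t* dst_uv, int width) {
-  const uint8_t* row1 = src_ayuv + stride;  // second row of the 2x2 box
-  for (int i = 0; i < width; i += 2) {
-    const uint8_t* p0 = src_ayuv + i * 4;
-    const uint8_t* p1 = row1 + i * 4;
-    int v = p0[0] + p0[4] + p1[0] + p1[4];
-    int u = p0[1] + p0[5] + p1[1] + p1[5];
-    dst_uv[i + 0] = (uint8_t)((u + 2) >> 2);  // uqrshrn #2: rounded /4
-    dst_uv[i + 1] = (uint8_t)((v + 2) >> 2);
-  }
-}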
-
-void AYUVToVURow_NEON(const uint8_t* src_ayuv,
- int src_stride_ayuv,
- uint8_t* dst_vu,
- int width) {
- const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
- asm volatile(
-
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels.
- "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
- "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
- "uqrshrn v1.8b, v1.8h, #2 \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
- "b.gt 1b \n"
- : "+r"(src_ayuv), // %0
- "+r"(src_ayuv_1), // %1
- "+r"(dst_vu), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
-
-// Copy row of AYUV Y's into Y
-void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
- asm volatile(
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels
- "subs %w2, %w2, #16 \n" // 16 pixels per loop
- "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
- "b.gt 1b \n"
- : "+r"(src_ayuv), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3");
-}
-
-void FloatDivToByteRow_NEON(const float* src_weights,
- const float* src_values,
- uint8_t* dst_out,
- uint8_t* dst_mask,
- int width) {
- asm volatile(
- "movi v0.4s, #0 \n"
-
- "1: \n"
- "ld1 {v1.4s,v2.4s}, [%0], #32 \n" // load 8 float weights
- "ld1 {v3.4s,v4.4s}, [%1], #32 \n" // load 8 float values
- "subs %w4, %w4, #8 \n" // 8 pixels per loop
-
- "fdiv v1.4s, v3.4s, v1.4s \n" // values / weights
- "fdiv v2.4s, v4.4s, v2.4s \n"
-
- "fcvtas v1.4s, v1.4s \n" // float to int
- "fcvtas v2.4s, v2.4s \n" // float to int
- "uqxtn v1.4h, v1.4s \n" // 8 shorts
- "uqxtn2 v1.8h, v2.4s \n"
- "uqxtn v1.8b, v1.8h \n" // 8 bytes
-
- "st1 {v1.8b}, [%2], #8 \n" // store 8 byte out
-
- "fcmgt v5.4s, v1.4s, v0.4s \n" // cmp weight to zero
- "fcmgt v6.4s, v2.4s, v0.4s \n"
- "uqxtn v5.4h, v5.4s \n" // 8 shorts
- "uqxtn2 v5.8h, v6.4s \n"
- "uqxtn v5.8b, v1.8h \n" // 8 bytes
-
- "st1 {v5.8b}, [%3], #8 \n" // store 8 byte mask
-
- "b.gt 1b \n"
- : "+r"(src_weights), // %0
- "+r"(src_values), // %1
- "+r"(dst_out), // %2
- "+r"(dst_mask), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
-}
-
-// Convert biplanar UV channel of NV12 to NV21
-void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
- asm volatile(
- "1: \n"
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
- "orr v2.16b, v0.16b, v0.16b \n" // move U after V
- "subs %w2, %w2, #16 \n" // 16 pixels per loop
- "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_vu), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2");
-}
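-
-// Scalar form (a sketch; hypothetical name): a byte swap within each pair.
-static void UVToVURow_Sketch_C(const uint8_t* src_uv, uint8_t* dst_vu,
-                               int width) {
-  for (int i = 0; i < width; ++i) {
-    dst_vu[2 * i + 0] = src_uv[2 * i + 1];  // V first
-    dst_vu[2 * i + 1] = src_uv[2 * i + 0];  // then U
-  }
-}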
-
-#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc
deleted file mode 100644
index 17831372..00000000
--- a/files/source/scale_any.cc
+++ /dev/null
@@ -1,575 +0,0 @@
-/*
- * Copyright 2015 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <string.h> // For memset/memcpy
-
-#include "libyuv/scale.h"
-#include "libyuv/scale_row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
-#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
- void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
- int dx) { \
- int r = dst_width & MASK; \
- int n = dst_width & ~MASK; \
- if (n > 0) { \
- TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
- } \
- TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
- }
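-
-// For example, ScaleFilterCols_Any_NEON(dst, src, 100, x, dx) with MASK 7
-// runs the NEON kernel on n = 96 columns, then the C fallback on the
-// remaining r = 4, with x advanced by 96 * dx (a worked illustration only).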
-
-#ifdef HAS_SCALEFILTERCOLS_NEON
-CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
-#endif
-#ifdef HAS_SCALEFILTERCOLS_MSA
-CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
-#endif
-#ifdef HAS_SCALEARGBCOLS_NEON
-CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
-#endif
-#ifdef HAS_SCALEARGBCOLS_MSA
-CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
-#endif
-#ifdef HAS_SCALEARGBCOLS_MMI
-CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON,
- ScaleARGBFilterCols_NEON,
- ScaleARGBFilterCols_C,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_MSA
-CANY(ScaleARGBFilterCols_Any_MSA,
- ScaleARGBFilterCols_MSA,
- ScaleARGBFilterCols_C,
- 4,
- 7)
-#endif
-#undef CANY
-
-// Fixed scale down.
-// Mask may be non-power of 2, so use MOD
-#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
- int dst_width) { \
- int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \
- int n = dst_width - r; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
- dst_ptr + n * BPP, r); \
- }
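-
-// Note FACTOR is substituted textually, so a 3/4 scaler's `4 / 3` expands to
-// `(n * 4 / 3) * BPP` = ((n * 4) / 3) * BPP by left-to-right evaluation.
-// E.g. dst_width 100 with MASK 23: r = 100 % 24 = 4, n = 96, and the C
-// fallback starts at source byte (96 * 4) / 3 = 128.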
-
-// Fixed scale down for odd source width. Used by I420Blend subsampling.
-// Since dst_width is (width + 1) / 2, this function scales one less pixel
-// and copies the last pixel.
-#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
- int dst_width) { \
- int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \
- int n = (dst_width - 1) - r; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
- dst_ptr + n * BPP, r + 1); \
- }
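-
-// E.g. an odd source width of 101 gives dst_width 51; with MASK 15,
-// r = (51 - 1) % 16 = 2 and n = 48, so the SIMD path emits 48 pixels and
-// the C path emits r + 1 = 3, the last being the copied final pixel.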
-
-#ifdef HAS_SCALEROWDOWN2_SSSE3
-SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_SSSE3,
- ScaleRowDown2Linear_SSSE3,
- ScaleRowDown2Linear_C,
- 2,
- 1,
- 15)
-SDANY(ScaleRowDown2Box_Any_SSSE3,
- ScaleRowDown2Box_SSSE3,
- ScaleRowDown2Box_C,
- 2,
- 1,
- 15)
-SDODD(ScaleRowDown2Box_Odd_SSSE3,
- ScaleRowDown2Box_SSSE3,
- ScaleRowDown2Box_Odd_C,
- 2,
- 1,
- 15)
-#endif
-#ifdef HAS_SCALEROWDOWN2_AVX2
-SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
-SDANY(ScaleRowDown2Linear_Any_AVX2,
- ScaleRowDown2Linear_AVX2,
- ScaleRowDown2Linear_C,
- 2,
- 1,
- 31)
-SDANY(ScaleRowDown2Box_Any_AVX2,
- ScaleRowDown2Box_AVX2,
- ScaleRowDown2Box_C,
- 2,
- 1,
- 31)
-SDODD(ScaleRowDown2Box_Odd_AVX2,
- ScaleRowDown2Box_AVX2,
- ScaleRowDown2Box_Odd_C,
- 2,
- 1,
- 31)
-#endif
-#ifdef HAS_SCALEROWDOWN2_NEON
-SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_NEON,
- ScaleRowDown2Linear_NEON,
- ScaleRowDown2Linear_C,
- 2,
- 1,
- 15)
-SDANY(ScaleRowDown2Box_Any_NEON,
- ScaleRowDown2Box_NEON,
- ScaleRowDown2Box_C,
- 2,
- 1,
- 15)
-SDODD(ScaleRowDown2Box_Odd_NEON,
- ScaleRowDown2Box_NEON,
- ScaleRowDown2Box_Odd_C,
- 2,
- 1,
- 15)
-#endif
-#ifdef HAS_SCALEROWDOWN2_MSA
-SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
-SDANY(ScaleRowDown2Linear_Any_MSA,
- ScaleRowDown2Linear_MSA,
- ScaleRowDown2Linear_C,
- 2,
- 1,
- 31)
-SDANY(ScaleRowDown2Box_Any_MSA,
- ScaleRowDown2Box_MSA,
- ScaleRowDown2Box_C,
- 2,
- 1,
- 31)
-#endif
-#ifdef HAS_SCALEROWDOWN2_MMI
-SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7)
-SDANY(ScaleRowDown2Linear_Any_MMI,
- ScaleRowDown2Linear_MMI,
- ScaleRowDown2Linear_C,
- 2,
- 1,
- 7)
-SDANY(ScaleRowDown2Box_Any_MMI,
- ScaleRowDown2Box_MMI,
- ScaleRowDown2Box_C,
- 2,
- 1,
- 7)
-SDODD(ScaleRowDown2Box_Odd_MMI,
- ScaleRowDown2Box_MMI,
- ScaleRowDown2Box_Odd_C,
- 2,
- 1,
- 7)
-#endif
-#ifdef HAS_SCALEROWDOWN4_SSSE3
-SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_SSSE3,
- ScaleRowDown4Box_SSSE3,
- ScaleRowDown4Box_C,
- 4,
- 1,
- 7)
-#endif
-#ifdef HAS_SCALEROWDOWN4_AVX2
-SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
-SDANY(ScaleRowDown4Box_Any_AVX2,
- ScaleRowDown4Box_AVX2,
- ScaleRowDown4Box_C,
- 4,
- 1,
- 15)
-#endif
-#ifdef HAS_SCALEROWDOWN4_NEON
-SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_NEON,
- ScaleRowDown4Box_NEON,
- ScaleRowDown4Box_C,
- 4,
- 1,
- 7)
-#endif
-#ifdef HAS_SCALEROWDOWN4_MSA
-SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
-SDANY(ScaleRowDown4Box_Any_MSA,
- ScaleRowDown4Box_MSA,
- ScaleRowDown4Box_C,
- 4,
- 1,
- 15)
-#endif
-#ifdef HAS_SCALEROWDOWN4_MMI
-SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_MMI,
- ScaleRowDown4Box_MMI,
- ScaleRowDown4Box_C,
- 4,
- 1,
- 7)
-#endif
-#ifdef HAS_SCALEROWDOWN34_SSSE3
-SDANY(ScaleRowDown34_Any_SSSE3,
- ScaleRowDown34_SSSE3,
- ScaleRowDown34_C,
- 4 / 3,
- 1,
- 23)
-SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
- ScaleRowDown34_0_Box_SSSE3,
- ScaleRowDown34_0_Box_C,
- 4 / 3,
- 1,
- 23)
-SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
- ScaleRowDown34_1_Box_SSSE3,
- ScaleRowDown34_1_Box_C,
- 4 / 3,
- 1,
- 23)
-#endif
-#ifdef HAS_SCALEROWDOWN34_NEON
-SDANY(ScaleRowDown34_Any_NEON,
- ScaleRowDown34_NEON,
- ScaleRowDown34_C,
- 4 / 3,
- 1,
- 23)
-SDANY(ScaleRowDown34_0_Box_Any_NEON,
- ScaleRowDown34_0_Box_NEON,
- ScaleRowDown34_0_Box_C,
- 4 / 3,
- 1,
- 23)
-SDANY(ScaleRowDown34_1_Box_Any_NEON,
- ScaleRowDown34_1_Box_NEON,
- ScaleRowDown34_1_Box_C,
- 4 / 3,
- 1,
- 23)
-#endif
-#ifdef HAS_SCALEROWDOWN34_MSA
-SDANY(ScaleRowDown34_Any_MSA,
- ScaleRowDown34_MSA,
- ScaleRowDown34_C,
- 4 / 3,
- 1,
- 47)
-SDANY(ScaleRowDown34_0_Box_Any_MSA,
- ScaleRowDown34_0_Box_MSA,
- ScaleRowDown34_0_Box_C,
- 4 / 3,
- 1,
- 47)
-SDANY(ScaleRowDown34_1_Box_Any_MSA,
- ScaleRowDown34_1_Box_MSA,
- ScaleRowDown34_1_Box_C,
- 4 / 3,
- 1,
- 47)
-#endif
-#ifdef HAS_SCALEROWDOWN38_SSSE3
-SDANY(ScaleRowDown38_Any_SSSE3,
- ScaleRowDown38_SSSE3,
- ScaleRowDown38_C,
- 8 / 3,
- 1,
- 11)
-SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
- ScaleRowDown38_3_Box_SSSE3,
- ScaleRowDown38_3_Box_C,
- 8 / 3,
- 1,
- 5)
-SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
- ScaleRowDown38_2_Box_SSSE3,
- ScaleRowDown38_2_Box_C,
- 8 / 3,
- 1,
- 5)
-#endif
-#ifdef HAS_SCALEROWDOWN38_NEON
-SDANY(ScaleRowDown38_Any_NEON,
- ScaleRowDown38_NEON,
- ScaleRowDown38_C,
- 8 / 3,
- 1,
- 11)
-SDANY(ScaleRowDown38_3_Box_Any_NEON,
- ScaleRowDown38_3_Box_NEON,
- ScaleRowDown38_3_Box_C,
- 8 / 3,
- 1,
- 11)
-SDANY(ScaleRowDown38_2_Box_Any_NEON,
- ScaleRowDown38_2_Box_NEON,
- ScaleRowDown38_2_Box_C,
- 8 / 3,
- 1,
- 11)
-#endif
-#ifdef HAS_SCALEROWDOWN38_MSA
-SDANY(ScaleRowDown38_Any_MSA,
- ScaleRowDown38_MSA,
- ScaleRowDown38_C,
- 8 / 3,
- 1,
- 11)
-SDANY(ScaleRowDown38_3_Box_Any_MSA,
- ScaleRowDown38_3_Box_MSA,
- ScaleRowDown38_3_Box_C,
- 8 / 3,
- 1,
- 11)
-SDANY(ScaleRowDown38_2_Box_Any_MSA,
- ScaleRowDown38_2_Box_MSA,
- ScaleRowDown38_2_Box_C,
- 8 / 3,
- 1,
- 11)
-#endif
-
-#ifdef HAS_SCALEARGBROWDOWN2_SSE2
-SDANY(ScaleARGBRowDown2_Any_SSE2,
- ScaleARGBRowDown2_SSE2,
- ScaleARGBRowDown2_C,
- 2,
- 4,
- 3)
-SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
- ScaleARGBRowDown2Linear_SSE2,
- ScaleARGBRowDown2Linear_C,
- 2,
- 4,
- 3)
-SDANY(ScaleARGBRowDown2Box_Any_SSE2,
- ScaleARGBRowDown2Box_SSE2,
- ScaleARGBRowDown2Box_C,
- 2,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBROWDOWN2_NEON
-SDANY(ScaleARGBRowDown2_Any_NEON,
- ScaleARGBRowDown2_NEON,
- ScaleARGBRowDown2_C,
- 2,
- 4,
- 7)
-SDANY(ScaleARGBRowDown2Linear_Any_NEON,
- ScaleARGBRowDown2Linear_NEON,
- ScaleARGBRowDown2Linear_C,
- 2,
- 4,
- 7)
-SDANY(ScaleARGBRowDown2Box_Any_NEON,
- ScaleARGBRowDown2Box_NEON,
- ScaleARGBRowDown2Box_C,
- 2,
- 4,
- 7)
-#endif
-#ifdef HAS_SCALEARGBROWDOWN2_MSA
-SDANY(ScaleARGBRowDown2_Any_MSA,
- ScaleARGBRowDown2_MSA,
- ScaleARGBRowDown2_C,
- 2,
- 4,
- 3)
-SDANY(ScaleARGBRowDown2Linear_Any_MSA,
- ScaleARGBRowDown2Linear_MSA,
- ScaleARGBRowDown2Linear_C,
- 2,
- 4,
- 3)
-SDANY(ScaleARGBRowDown2Box_Any_MSA,
- ScaleARGBRowDown2Box_MSA,
- ScaleARGBRowDown2Box_C,
- 2,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBROWDOWN2_MMI
-SDANY(ScaleARGBRowDown2_Any_MMI,
- ScaleARGBRowDown2_MMI,
- ScaleARGBRowDown2_C,
- 2,
- 4,
- 1)
-SDANY(ScaleARGBRowDown2Linear_Any_MMI,
- ScaleARGBRowDown2Linear_MMI,
- ScaleARGBRowDown2Linear_C,
- 2,
- 4,
- 1)
-SDANY(ScaleARGBRowDown2Box_Any_MMI,
- ScaleARGBRowDown2Box_MMI,
- ScaleARGBRowDown2Box_C,
- 2,
- 4,
- 1)
-#endif
-#undef SDANY
-
-// Scale down by even scale factor.
-#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \
- uint8_t* dst_ptr, int dst_width) { \
- int r = dst_width & MASK; \
- int n = dst_width & ~MASK; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \
- dst_ptr + n * BPP, r); \
- }
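-
-// E.g. ScaleARGBRowDownEven with src_stepx 2 samples every other ARGB pixel;
-// with MASK 3 and dst_width 10, the SIMD path emits 8 pixels and the C path
-// the last 2, starting at source byte (8 * 2) * 4 = 64.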
-
-#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
-SDAANY(ScaleARGBRowDownEven_Any_SSE2,
- ScaleARGBRowDownEven_SSE2,
- ScaleARGBRowDownEven_C,
- 4,
- 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
- ScaleARGBRowDownEvenBox_SSE2,
- ScaleARGBRowDownEvenBox_C,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
-SDAANY(ScaleARGBRowDownEven_Any_NEON,
- ScaleARGBRowDownEven_NEON,
- ScaleARGBRowDownEven_C,
- 4,
- 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
- ScaleARGBRowDownEvenBox_NEON,
- ScaleARGBRowDownEvenBox_C,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA
-SDAANY(ScaleARGBRowDownEven_Any_MSA,
- ScaleARGBRowDownEven_MSA,
- ScaleARGBRowDownEven_C,
- 4,
- 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
- ScaleARGBRowDownEvenBox_MSA,
- ScaleARGBRowDownEvenBox_C,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI
-SDAANY(ScaleARGBRowDownEven_Any_MMI,
- ScaleARGBRowDownEven_MMI,
- ScaleARGBRowDownEven_C,
- 4,
- 1)
-SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
- ScaleARGBRowDownEvenBox_MMI,
- ScaleARGBRowDownEvenBox_C,
- 4,
- 1)
-#endif
-
-#ifdef SASIMDONLY
-// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
-
-// Add rows box filter scale down. Using macro from row_any
-#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \
- SIMD_ALIGNED(uint16_t dst_temp[32]); \
- SIMD_ALIGNED(uint8_t src_temp[32]); \
- memset(dst_temp, 0, 32 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, n); \
- } \
- memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \
- memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \
- ANY_SIMD(src_temp, dst_temp, MASK + 1); \
- memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \
- }
-
-#ifdef HAS_SCALEADDROW_SSE2
-SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
-#endif
-#ifdef HAS_SCALEADDROW_AVX2
-SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
-#endif
-#ifdef HAS_SCALEADDROW_NEON
-SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
-#endif
-#ifdef HAS_SCALEADDROW_MSA
-SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
-#endif
-#ifdef HAS_SCALEADDROW_MMI
-SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
-#endif
-#undef SAROW
-
-#else
-
-// Add rows box filter scale down.
-#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
- void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
- int n = src_width & ~MASK; \
- if (n > 0) { \
- SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
- } \
- SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
- }
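-
-// The dispatched row op accumulates one source row into a 16-bit sum row
-// that a later pass divides by the box height. A scalar sketch of that
-// accumulate contract (hypothetical name, for illustration):
-static void ScaleAddRow_Sketch_C(const uint8_t* src_ptr, uint16_t* dst_ptr,
-                                 int src_width) {
-  for (int i = 0; i < src_width; ++i) {
-    dst_ptr[i] += src_ptr[i];  // accumulate; caller normalizes later
-  }
-}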
-
-#ifdef HAS_SCALEADDROW_SSE2
-SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
-#endif
-#ifdef HAS_SCALEADDROW_AVX2
-SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
-#endif
-#ifdef HAS_SCALEADDROW_NEON
-SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
-#endif
-#ifdef HAS_SCALEADDROW_MSA
-SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
-#endif
-#ifdef HAS_SCALEADDROW_MMI
-SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
-#endif
-#undef SAANY
-
-#endif // SASIMDONLY
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_dspr2.cc b/files/source/scale_dspr2.cc
deleted file mode 100644
index ddedcbf4..00000000
--- a/files/source/scale_dspr2.cc
+++ /dev/null
@@ -1,668 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC MIPS DSPR2
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void ScaleRowDown2_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
- "beqz $t9, 2f \n"
- " nop \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
- // TODO(fbarchard): Use odd pixels instead of even.
- "precrq.qb.ph $t8, $t1, $t0 \n" // |7|5|3|1|
- "precrq.qb.ph $t0, $t3, $t2 \n" // |15|13|11|9|
- "precrq.qb.ph $t1, $t5, $t4 \n" // |23|21|19|17|
- "precrq.qb.ph $t2, $t7, $t6 \n" // |31|29|27|25|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu $t9, $t9, -1 \n"
- "sw $t8, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $t1, 8(%[dst]) \n"
- "sw $t2, 12(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 16 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 0xf \n" // residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lbu $t0, 1(%[src_ptr]) \n"
- "addiu %[src_ptr], %[src_ptr], 2 \n"
- "addiu $t9, $t9, -1 \n"
- "sb $t0, 0(%[dst]) \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
- : [dst_width] "r"(dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- const uint8* t = src_ptr + src_stride;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
- "bltz $t9, 2f \n"
- " nop \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 0(%[t]) \n" // |19|18|17|16|
- "lw $t5, 4(%[t]) \n" // |23|22|21|20|
- "lw $t6, 8(%[t]) \n" // |27|26|25|24|
- "lw $t7, 12(%[t]) \n" // |31|30|29|28|
- "addiu $t9, $t9, -1 \n"
- "srl $t8, $t0, 16 \n" // |X|X|3|2|
- "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
- "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
- "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
- "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
- "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
- "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
- "srl $t8, $t1, 16 \n" // |X|X|7|6|
- "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
- "ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
- "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
- "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
- "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
- "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
- "srl $t8, $t2, 16 \n" // |X|X|11|10|
- "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
- "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
- "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
- "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
- "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
- "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
- "srl $t8, $t3, 16 \n" // |X|X|15|14|
- "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
- "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
- "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
- "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
- "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
- "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
- "addiu %[src_ptr], %[src_ptr], 16 \n"
- "addiu %[t], %[t], 16 \n"
- "sb $t0, 0(%[dst]) \n"
- "sb $t4, 1(%[dst]) \n"
- "sb $t1, 2(%[dst]) \n"
- "sb $t5, 3(%[dst]) \n"
- "sb $t2, 4(%[dst]) \n"
- "sb $t6, 5(%[dst]) \n"
- "sb $t3, 6(%[dst]) \n"
- "sb $t7, 7(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 8 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 0x7 \n" // x = residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lwr $t1, 0(%[src_ptr]) \n"
- "lwl $t1, 3(%[src_ptr]) \n"
- "lwr $t2, 0(%[t]) \n"
- "lwl $t2, 3(%[t]) \n"
- "srl $t8, $t1, 16 \n"
- "ins $t1, $t2, 16, 16 \n"
- "ins $t2, $t8, 0, 16 \n"
- "raddu.w.qb $t1, $t1 \n"
- "raddu.w.qb $t2, $t2 \n"
- "shra_r.w $t1, $t1, 2 \n"
- "shra_r.w $t2, $t2, 2 \n"
- "sb $t1, 0(%[dst]) \n"
- "sb $t2, 1(%[dst]) \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "addiu $t9, $t9, -2 \n"
- "addiu %[t], %[t], 4 \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 2 \n"
-
- "3: \n"
- ".set pop \n"
-
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
- : [dst_width] "r"(dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown4_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 3 \n"
- "beqz $t9, 2f \n"
- " nop \n"
-
- "1: \n"
- "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
- "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
- "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
- "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
- "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
- "precrq.qb.ph $t1, $t2, $t1 \n" // |14|10|6|2|
- "precrq.qb.ph $t5, $t6, $t5 \n" // |30|26|22|18|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu $t9, $t9, -1 \n"
- "sw $t1, 0(%[dst]) \n"
- "sw $t5, 4(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 8 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 7 \n" // residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lbu $t1, 2(%[src_ptr]) \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "addiu $t9, $t9, -1 \n"
- "sb $t1, 0(%[dst]) \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
- : [dst_width] "r"(dst_width)
- : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- intptr_t stride = src_stride;
- const uint8* s1 = src_ptr + stride;
- const uint8* s2 = s1 + stride;
- const uint8* s3 = s2 + stride;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 1 \n"
- "andi $t8, %[dst_width], 1 \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
- "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
- "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
- "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 4(%[s1]) \n" // |23|22|21|20|
- "lw $t6, 4(%[s2]) \n" // |27|26|25|24|
- "lw $t7, 4(%[s3]) \n" // |31|30|29|28|
- "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
- "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
- "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
- "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
- "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
- "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
- "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
- "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
- "add $t0, $t0, $t1 \n"
- "add $t1, $t2, $t3 \n"
- "add $t0, $t0, $t1 \n"
- "add $t4, $t4, $t5 \n"
- "add $t6, $t6, $t7 \n"
- "add $t4, $t4, $t6 \n"
- "shra_r.w $t0, $t0, 4 \n"
- "shra_r.w $t4, $t4, 4 \n"
- "sb $t0, 0(%[dst]) \n"
- "sb $t4, 1(%[dst]) \n"
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[s1], %[s1], 8 \n"
- "addiu %[s2], %[s2], 8 \n"
- "addiu %[s3], %[s3], 8 \n"
- "addiu $t9, $t9, -1 \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 2 \n"
- "beqz $t8, 2f \n"
- " nop \n"
-
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
- "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
- "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
- "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
- "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
- "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
- "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
- "add $t0, $t0, $t1 \n"
- "add $t1, $t2, $t3 \n"
- "add $t0, $t0, $t1 \n"
- "shra_r.w $t0, $t0, 4 \n"
- "sb $t0, 0(%[dst]) \n"
-
- "2: \n"
- ".set pop \n"
-
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2),
- [s3] "+r"(s3)
- : [dst_width] "r"(dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown34_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "1: \n"
- "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
- "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
- "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
- "addiu %[dst_width], %[dst_width], -24 \n"
- "ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
- "ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
- "ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
- "ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
- "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
- "prepend $t1, $t2, 8 \n" // |4|3|1|0|
- "prepend $t3, $t4, 24 \n" // |15|13|12|11|
- "prepend $t5, $t6, 8 \n" // |20|19|17|16|
- "prepend $t7, $t8, 24 \n" // |31|29|28|27|
- "sw $t1, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $t3, 8(%[dst]) \n"
- "sw $t5, 12(%[dst]) \n"
- "sw $t9, 16(%[dst]) \n"
- "sw $t7, 20(%[dst]) \n"
- "bnez %[dst_width], 1b \n"
- " addiu %[dst], %[dst], 24 \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* d,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "repl.ph $t3, 3 \n" // 0x00030003
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
- "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
- "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
- "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
- "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
- "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
- "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
- "raddu.w.qb $t0, $t0 \n"
- "raddu.w.qb $t1, $t1 \n"
- "shra_r.w $t0, $t0, 1 \n"
- "shra_r.w $t1, $t1, 1 \n"
- "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
- "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
- "rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
- "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
- "addu.ph $t2, $t2, $t4 \n"
- "addu.ph $t6, $t6, $t5 \n"
- "sll $t5, $t0, 1 \n"
- "add $t0, $t5, $t0 \n"
- "shra_r.ph $t2, $t2, 2 \n"
- "shra_r.ph $t6, $t6, 2 \n"
- "shll.ph $t4, $t2, 1 \n"
- "addq.ph $t4, $t4, $t2 \n"
- "addu $t0, $t0, $t1 \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "shra_r.w $t0, $t0, 2 \n"
- "addu.ph $t6, $t6, $t4 \n"
- "shra_r.ph $t6, $t6, 2 \n"
- "srl $t1, $t6, 16 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "sb $t1, 0(%[d]) \n"
- "sb $t0, 1(%[d]) \n"
- "sb $t6, 2(%[d]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[d], %[d], 3 \n"
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
- [dst_width] "+r"(dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
-}
-
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* d,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "repl.ph $t2, 3 \n" // 0x00030003
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
- "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
- "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
- "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
- "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
- "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
- "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
- "raddu.w.qb $t0, $t0 \n"
- "raddu.w.qb $t1, $t1 \n"
- "shra_r.w $t0, $t0, 1 \n"
- "shra_r.w $t1, $t1, 1 \n"
- "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
- "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
- "rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
- "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
- "addu.ph $t4, $t4, $t3 \n"
- "addu.ph $t6, $t6, $t5 \n"
- "shra_r.ph $t6, $t6, 2 \n"
- "shra_r.ph $t4, $t4, 2 \n"
- "addu.ph $t6, $t6, $t4 \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "shra_r.ph $t6, $t6, 1 \n"
- "addu $t0, $t0, $t1 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "shra_r.w $t0, $t0, 1 \n"
- "srl $t1, $t6, 16 \n"
- "sb $t1, 0(%[d]) \n"
- "sb $t0, 1(%[d]) \n"
- "sb $t6, 2(%[d]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[d], %[d], 3 \n"
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
- [dst_width] "+r"(dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
-}
-
-void ScaleRowDown38_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
- "wsbh $t0, $t0 \n" // |2|3|0|1|
- "wsbh $t6, $t6 \n" // |26|27|24|25|
- "srl $t0, $t0, 8 \n" // |X|2|3|0|
- "srl $t3, $t3, 16 \n" // |X|X|15|14|
- "srl $t5, $t5, 16 \n" // |X|X|23|22|
- "srl $t7, $t7, 16 \n" // |X|X|31|30|
- "ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
- "ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
- "ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
- "ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
- "prepend $t2, $t3, 24 \n" // |X|15|14|11|
- "ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
- "ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu %[dst_width], %[dst_width], -12 \n"
- "addiu $t8,%[dst_width], -12 \n"
- "sw $t1, 0(%[dst]) \n"
- "sw $t4, 4(%[dst]) \n"
- "sw $t6, 8(%[dst]) \n"
- "bgez $t8, 1b \n"
- " addiu %[dst], %[dst], 12 \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
-}
-
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr,
- int dst_width) {
- intptr_t stride = src_stride;
- const uint8* t = src_ptr + stride;
- const int c = 0x2AAA;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
- "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
- "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
- "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
- "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
- "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
- "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
- "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
- "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
- "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
- "srl $t4, $t4, 2 \n" // t4 / 4
- "srl $t6, $t6, 16 \n" // |0|0|S3|T3|
- "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
- "addu $t6, $t5, $t6 \n"
- "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
- "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
- "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
- "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
- "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
- "addu $t0, $t0, $t2 \n"
- "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[t], %[t], 8 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "addiu %[dst_ptr], %[dst_ptr], 3 \n"
- "srl $t6, $t6, 16 \n"
- "srl $t0, $t0, 16 \n"
- "sb $t4, -1(%[dst_ptr]) \n"
- "sb $t6, -2(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " sb $t0, -3(%[dst_ptr]) \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
- [dst_width] "+r"(dst_width)
- : [c] "r"(c)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
-}
-
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr,
- int dst_width) {
- intptr_t stride = src_stride;
- const uint8* s1 = src_ptr + stride;
- stride += stride;
- const uint8* s2 = src_ptr + stride;
- const int c1 = 0x1C71;
- const int c2 = 0x2AAA;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
- "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
- "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
- "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
- "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
- "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
- "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
- "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
- "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
- "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
- "sll $t8, $t5, 16 \n" // |R5|R4|0|0|
- "raddu.w.qb $t8, $t8 \n" // R5+R4
- "addu $t7, $t7, $t8 \n"
- "srl $t8, $t5, 16 \n" // |0|0|R7|R6|
- "raddu.w.qb $t8, $t8 \n" // R7 + R6
- "addu $t6, $t6, $t8 \n"
- "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
- "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
- "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
- "srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
- "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
- "addu $t7, $t7, $t8 \n"
- "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
- "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
- "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
- "sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
- "raddu.w.qb $t0, $t0 \n"
- "raddu.w.qb $t2, $t2 \n"
- "raddu.w.qb $t4, $t4 \n"
- "addu $t0, $t0, $t2 \n"
- "addu $t0, $t0, $t4 \n"
- "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[s1], %[s1], 8 \n"
- "addiu %[s2], %[s2], 8 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "addiu %[dst_ptr], %[dst_ptr], 3 \n"
- "srl $t6, $t6, 16 \n"
- "srl $t7, $t7, 16 \n"
- "srl $t0, $t0, 16 \n"
- "sb $t6, -1(%[dst_ptr]) \n"
- "sb $t7, -2(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " sb $t0, -3(%[dst_ptr]) \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
- [s2] "+r"(s2), [dst_width] "+r"(dst_width)
- : [c1] "r"(c1), [c2] "r"(c2)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
-}
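-
-// The constants above are rounded 16.16 reciprocals: 0x2AAA ~ 65536 / 6 and
-// 0x1C71 ~ 65536 / 9, so a `mul` followed by `srl ..., 16` averages a 6- or
-// 9-sample box sum without a per-pixel divide.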
-
-void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
- int x;
- for (x = 0; x < (src_width - 1); x += 8) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4;
- uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t5], 0(%[src_ptr]) \n"
- "lw %[tmp_t6], 4(%[src_ptr]) \n"
- "lw %[tmp_t1], 0(%[dst_ptr]) \n"
- "lw %[tmp_t2], 4(%[dst_ptr]) \n"
- "lw %[tmp_t3], 8(%[dst_ptr]) \n"
- "lw %[tmp_t4], 12(%[dst_ptr]) \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t5] \n"
- "preceu.ph.qbl %[tmp_t8], %[tmp_t5] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t7] \n"
- "addu.ph %[tmp_t2], %[tmp_t2], %[tmp_t8] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t6] \n"
- "preceu.ph.qbl %[tmp_t8], %[tmp_t6] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t7] \n"
- "addu.ph %[tmp_t4], %[tmp_t4], %[tmp_t8] \n"
- "sw %[tmp_t1], 0(%[dst_ptr]) \n"
- "sw %[tmp_t2], 4(%[dst_ptr]) \n"
- "sw %[tmp_t3], 8(%[dst_ptr]) \n"
- "sw %[tmp_t4], 12(%[dst_ptr]) \n"
- ".set pop \n"
- :
- [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3),
- [tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr)
- : [dst_ptr] "r"(dst_ptr));
- src_ptr += 8;
- dst_ptr += 8;
- }
-
- if (src_width & 7) {
- for (x = 0; x < ((src_width - 1) & 7); x += 1) {
- dst_ptr[0] += src_ptr[0];
- src_ptr += 1;
- dst_ptr += 1;
- }
- }
-}
-
-#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc
deleted file mode 100644
index 90a49f30..00000000
--- a/files/source/scale_gcc.cc
+++ /dev/null
@@ -1,1374 +0,0 @@
-/*
- * Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-
-// Offsets for source bytes 0 to 9
-static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
-static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-// Offsets for source bytes 0 to 10
-static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
-
-// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
-static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
- 8, 9, 9, 10, 10, 11, 12, 13};
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
-static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
- 10, 11, 12, 13, 13, 14, 14, 15};
-
-// Coefficients for source bytes 0 to 10
-static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
-
-// Coefficients for source bytes 10 to 21
-static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
-
-// Coefficients for source bytes 21 to 31
-static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
-
-// Rounding constant for the 3/4 box filters.
-static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
-
-static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
- 6, 8, 11, 14, 128, 128, 128, 128};
-
-// Arrange words 0,3,6 into 0,1,2
-static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-// Arrange words 0,3,6 into 3,4,5
-static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
- 6, 7, 12, 13, 128, 128, 128, 128};
-
-// Scaling values for boxes of 3x3 and 2x3
-static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
- 65536 / 9, 65536 / 6, 0, 0};
-
-// Arrange first value for pixels 0,1,2,3,4,5
-static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
- 11, 128, 14, 128, 128, 128, 128, 128};
-
-// Arrange second value for pixels 0,1,2,3,4,5
-static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
- 12, 128, 15, 128, 128, 128, 128, 128};
-
-// Arrange third value for pixels 0,1,2,3,4,5
-static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
- 13, 128, 128, 128, 128, 128, 128, 128};
-
-// Scaling values for boxes of 3x2 and 2x2
-static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
- 65536 / 3, 65536 / 2, 0, 0};
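-
-// These tables are 16.16 reciprocals of the box sizes: a box sum s becomes
-// an average via (s * (65536 / n)) >> 16, e.g. (s * 7281) >> 16 ~ s / 9,
-// trading the divide for a 16-bit multiply-high.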
-
-// GCC versions of row functions are verbatim conversions from Visual C.
-// Generated using gcc disassembly on Visual C object file:
-// objdump -D yuvscaler.obj >yuvscaler.txt
-
-void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
-}
-
-void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm4", "xmm5");
-}
-
-void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "psrlw $0x1,%%xmm0 \n"
- "psrlw $0x1,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-
-#ifdef HAS_SCALEROWDOWN2_AVX2
-void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
-}
-
-void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm4", "xmm5");
-}
-
-void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SCALEROWDOWN2_AVX2
-
-void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
- "pslld $0x10,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- intptr_t stridex3;
- asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "psllw $0x3,%%xmm5 \n"
- "lea 0x00(%4,%4,2),%3 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%4,2),%%xmm2 \n"
- "movdqu 0x10(%0,%4,2),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "=&r"(stridex3) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-#ifdef HAS_SCALEROWDOWN4_AVX2
-void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrld $0x18,%%ymm5,%%ymm5 \n"
- "vpslld $0x10,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm5");
-}
-
-void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpsllw $0x3,%%ymm4,%%ymm5 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(src_stride * 3)) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif // HAS_SCALEROWDOWN4_AVX2
-
-void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "movdqa %0,%%xmm3 \n"
- "movdqa %1,%%xmm4 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kShuf0), // %0
- "m"(kShuf1), // %1
- "m"(kShuf2) // %2
- );
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm2 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "palignr $0x8,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-
-void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
- :
- : "m"(kShuf01), // %0
- "m"(kShuf11), // %1
- "m"(kShuf21) // %2
- );
- asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
- :
- : "m"(kMadd01), // %0
- "m"(kMadd11), // %1
- "m"(kRound34) // %2
- );
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-
-void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
- :
- : "m"(kShuf01), // %0
- "m"(kShuf11), // %1
- "m"(kShuf21) // %2
- );
- asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
- :
- : "m"(kMadd01), // %0
- "m"(kMadd11), // %1
- "m"(kRound34) // %2
- );
-
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-
-void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movd %%xmm1,0x8(%1) \n"
- "lea 0xc(%1),%1 \n"
- "sub $0xc,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "m"(kShuf38a), // %3
- "m"(kShuf38b) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
-}
-
-void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "movdqa %3,%%xmm5 \n"
- :
- : "m"(kShufAb0), // %0
- "m"(kShufAb1), // %1
- "m"(kShufAb2), // %2
- "m"(kScaleAb2) // %3
- );
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "paddusw %%xmm6,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,(%1) \n"
- "psrlq $0x10,%%xmm1 \n"
- "movd %%xmm1,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-
-void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- :
- : "m"(kShufAc), // %0
- "m"(kShufAc3), // %1
- "m"(kScaleAc33) // %2
- );
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm6 \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqu 0x00(%0,%3,2),%%xmm6 \n"
- "lea 0x10(%0),%0 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "pshufb %%xmm3,%%xmm7 \n"
- "paddusw %%xmm7,%%xmm6 \n"
- "pmulhuw %%xmm4,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movd %%xmm6,(%1) \n"
- "psrlq $0x10,%%xmm6 \n"
- "movd %%xmm6,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-
-// Reads 16xN bytes and produces 16 shorts at a time.
-void ScaleAddRow_SSE2(const uint8_t* src_ptr,
- uint16_t* dst_ptr,
- int src_width) {
- asm volatile(
-
- "pxor %%xmm5,%%xmm5 \n"
-
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n" // src_ptr += 16
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x10(%1),%%xmm1 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
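// For reference, a scalar sketch of the row accumulation above (hypothetical
// name, not part of libyuv). Note the SSE2 loop uses paddusw, so its adds
// saturate at 65535; this plain-C sketch wraps instead.
static void ScaleAddRow_Sketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] += src_ptr[x];
  }
}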
-
-#ifdef HAS_SCALEADDROW_AVX2
-// Reads 32 bytes and accumulates to 32 shorts at a time.
-void ScaleAddRow_AVX2(const uint8_t* src_ptr,
- uint16_t* dst_ptr,
- int src_width) {
- asm volatile(
-
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm3 \n"
- "lea 0x20(%0),%0 \n" // src_ptr += 32
- "vpermq $0xd8,%%ymm3,%%ymm3 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpaddusw (%1),%%ymm2,%%ymm0 \n"
- "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
-}
-#endif // HAS_SCALEADDROW_AVX2
-
-// Constant for making pixels signed to avoid pmaddubsw saturation.
-static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
-
-// Constant for making pixels unsigned and adding .5 for rounding.
-static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
- 0x4040, 0x4040, 0x4040, 0x4040};
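// How these two constants combine, as a worked sketch (not part of the
// original file): pmaddubsw multiplies unsigned bytes by signed bytes, so the
// filter below biases each pixel by -128 first. For pixels a, b and a 7-bit
// fraction f it then computes
//   (a - 128) * (128 - f) + (b - 128) * f = a * (128 - f) + b * f - 0x4000
// Adding kFadd40 (0x4040 = 0x4000 + 0x40) removes the bias and supplies the
// 0.5 rounding term before the final >> 7. A scalar model, with the
// hypothetical name FilterPixel_Sketch (assumes f in [0, 128)):
static inline uint8_t FilterPixel_Sketch(uint8_t a, uint8_t b, int f) {
  int sum = (a - 128) * (128 - f) + (b - 128) * f;  // what pmaddubsw yields.
  return (uint8_t)((sum + 0x4040) >> 7);            // un-bias and round.
}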
-
-// Bilinear column filtering. SSSE3 version.
-void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- intptr_t x0, x1, temp_pixel;
- asm volatile(
- "movd %6,%%xmm2 \n"
- "movd %7,%%xmm3 \n"
- "movl $0x04040000,%k2 \n"
- "movd %k2,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n" // 0x007f007f
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $15,%%xmm7 \n" // 0x00010001
-
- "pextrw $0x1,%%xmm2,%k3 \n"
- "subl $0x2,%5 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
-
- LABELALIGN
- "2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movzwl 0x00(%1,%4,1),%k2 \n"
- "movd %k2,%%xmm4 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "punpcklwd %%xmm4,%%xmm0 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
- // 1
- "paddusb %%xmm7,%%xmm1 \n"
- "pmaddubsw %%xmm0,%%xmm1 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "paddw %9,%%xmm1 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,%k2 \n"
- "mov %w2,(%0) \n"
- "lea 0x2(%0),%0 \n"
- "subl $0x2,%5 \n"
- "jge 2b \n"
-
- LABELALIGN
- "29: \n"
- "addl $0x1,%5 \n"
- "jl 99f \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm2 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm2 \n"
- "paddusb %%xmm7,%%xmm2 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "paddw %9,%%xmm2 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movd %%xmm2,%k2 \n"
- "mov %b2,(%0) \n"
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "=&a"(temp_pixel), // %2
- "=&r"(x0), // %3
- "=&r"(x1), // %4
-#if defined(__x86_64__)
- "+rm"(dst_width) // %5
-#else
- "+m"(dst_width) // %5
-#endif
- : "rm"(x), // %6
- "rm"(dx), // %7
-#if defined(__x86_64__)
- "x"(kFsub80), // %8
- "x"(kFadd40) // %9
-#else
- "m"(kFsub80), // %8
- "m"(kFadd40) // %9
-#endif
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
-}
-
-// Reads 16 pixels, duplicates them and writes 32 pixels per loop.
-// No alignment requirement: the loop uses unaligned loads and stores (movdqu).
-void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- (void)x;
- (void)dx;
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
-
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
-}
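// A scalar sketch of the 2x point upsample above (hypothetical helper, not
// part of libyuv): each source byte is simply written twice.
static void ScaleColsUp2_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    dst_ptr[2 * x] = dst_ptr[2 * x + 1] = src_ptr[x];
  }
}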
-
-void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "shufps $0xdd,%%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
-}
-
-void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
-}
-
-void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
-}
-
-// Reads 4 ARGB pixels at a time.
-// No alignment requirement: the destination is written with movdqu.
-void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
- intptr_t src_stepx_x12;
- (void)src_stride;
- asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
-
- LABELALIGN
- "1: \n"
- "movd (%0),%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%0,%1,2),%%xmm2 \n"
- "movd 0x00(%0,%4,1),%%xmm3 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "punpckldq %%xmm3,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width), // %3
- "=&r"(src_stepx_x12) // %4
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3");
-}
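// A scalar sketch of the even-pixel subsample above (hypothetical helper, not
// part of libyuv): every src_stepx-th ARGB pixel is copied.
static void ScaleARGBRowDownEven_Sketch(const uint8_t* src_argb, int src_stepx,
                                        uint8_t* dst_argb, int dst_width) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}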
-
-// Blends four 2x2 ARGB blocks down to 4x1.
-// No alignment requirement: the destination is written with movdqu.
-void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
- intptr_t src_stepx_x12;
- intptr_t row1 = (intptr_t)(src_stride);
- asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
- "lea 0x00(%0,%5,1),%5 \n"
-
- LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movhps 0x00(%0,%1,1),%%xmm0 \n"
- "movq 0x00(%0,%1,2),%%xmm1 \n"
- "movhps 0x00(%0,%4,1),%%xmm1 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "movq (%5),%%xmm2 \n"
- "movhps 0x00(%5,%1,1),%%xmm2 \n"
- "movq 0x00(%5,%1,2),%%xmm3 \n"
- "movhps 0x00(%5,%4,1),%%xmm3 \n"
- "lea 0x00(%5,%1,4),%5 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_argb), // %2
- "+rm"(dst_width), // %3
- "=&r"(src_stepx_x12), // %4
- "+r"(row1) // %5
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3");
-}
-
-void ScaleARGBCols_SSE2(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- intptr_t x0, x1;
- asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "pshufd $0x11,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x5,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "cmp $0x0,%4 \n"
- "jl 99f \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
-
- LABELALIGN
- "40: \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "pextrw $0x7,%%xmm2,%k1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%3,%0,4),%%xmm1 \n"
- "movd 0x00(%3,%1,4),%%xmm4 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "punpckldq %%xmm4,%%xmm1 \n"
- "punpcklqdq %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
-
- "49: \n"
- "test $0x2,%4 \n"
- "je 29f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%2) \n"
- "lea 0x8(%2),%2 \n"
- "29: \n"
- "test $0x1,%4 \n"
- "je 99f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "99: \n"
- : "=&a"(x0), // %0
- "=&d"(x1), // %1
- "+r"(dst_argb), // %2
- "+r"(src_argb), // %3
- "+r"(dst_width) // %4
- : "rm"(x), // %5
- "rm"(dx) // %6
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
-}
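// For reference, a scalar sketch of the same 16.16 fixed-point point sampling
// (hypothetical name, not part of libyuv): each output pixel is src[x >> 16]
// and x advances by dx.
static void ScaleARGBCols_Sketch(uint8_t* dst_argb, const uint8_t* src_argb,
                                 int dst_width, int x, int dx) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}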
-
-// Reads 4 ARGB pixels, duplicates them and writes 8 ARGB pixels.
-// No alignment requirement: the loop uses unaligned loads and stores (movdqu).
-void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- (void)x;
- (void)dx;
- asm volatile(
-
- LABELALIGN
- "1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpckldq %%xmm0,%%xmm0 \n"
- "punpckhdq %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
-
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
-}
-
-// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static const uvec8 kShuffleColARGB = {
- 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
- 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
-};
-
-// Shuffle table for duplicating 2 fractions into 8 bytes each
-static const uvec8 kShuffleFractions = {
- 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
-};
-
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- intptr_t x0, x1;
- asm volatile(
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm5 \n"
- :
- : "m"(kShuffleColARGB), // %0
- "m"(kShuffleFractions) // %1
- );
-
- asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "sub $0x2,%2 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
-
- LABELALIGN
- "2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movhps 0x00(%1,%4,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%0) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x2,%2 \n"
- "jge 2b \n"
-
- LABELALIGN
- "29: \n"
- "add $0x1,%2 \n"
- "jl 99f \n"
- "psrlw $0x9,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm2 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%0) \n"
-
- LABELALIGN "99: \n" // one line to sidestep a clang-format error.
-
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+rm"(dst_width), // %2
- "=&r"(x0), // %3
- "=&r"(x1) // %4
- : "rm"(x), // %5
- "rm"(dx) // %6
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
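// A scalar model of the per-channel blend above (a sketch only; the
// hypothetical name is not part of libyuv). f is the top 7 bits of the 16.16
// fraction, and the weights (127 - f, f) sum to 127, the slight approximation
// that the pxor with the 0x7f constant in xmm6 sets up:
static inline uint8_t BlendARGBChannel_Sketch(uint8_t a, uint8_t b, int f) {
  return (uint8_t)((a * (127 - f) + b * f) >> 7);
}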
-
-// Divide num by div and return as 16.16 fixed point result.
-int FixedDiv_X86(int num, int div) {
- asm volatile(
- "cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
- : "+a"(num) // %0
- : "c"(div) // %1
- : "memory", "cc", "edx");
- return num;
-}
-
-// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
-int FixedDiv1_X86(int num, int div) {
- asm volatile(
- "cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "sub $0x10001,%%eax \n"
- "sbb $0x0,%%edx \n"
- "sub $0x1,%1 \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
- : "+a"(num) // %0
- : "c"(div) // %1
- : "memory", "cc", "edx");
- return num;
-}
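// For reference, portable scalar sketches of the two divides above (the asm
// keeps the 48-bit intermediate in edx:eax, which int64_t reproduces). The
// _Sketch names are hypothetical and not part of libyuv.
static inline int FixedDiv_Sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}

static inline int FixedDiv1_Sketch(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}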
-
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_mmi.cc b/files/source/scale_mmi.cc
deleted file mode 100644
index 990463c2..00000000
--- a/files/source/scale_mmi.cc
+++ /dev/null
@@ -1,1113 +0,0 @@
-/*
- * Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/scale.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h" // For CopyARGB
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for MIPS MMI.
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-// clang-format off
-
-// CPU agnostic row functions
-void ScaleRowDown2_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1, dest;
- const uint64_t shift = 0x8ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "psrlh %[src0], %[src0], %[shift] \n\t"
-
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "psrlh %[src1], %[src1], %[shift] \n\t"
-
- "packushb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift] "f"(shift)
- : "memory");
-}
-
-void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest0, dest1;
-
- const uint64_t mask = 0x00ff00ff00ff00ffULL;
- const uint64_t shift = 0x8ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "and %[dest0], %[src0], %[mask] \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "and %[dest1], %[src1], %[mask] \n\t"
- "packushb %[dest0], %[dest0], %[dest1] \n\t"
-
- "psrlh %[src0], %[src0], %[shift] \n\t"
- "psrlh %[src1], %[src1], %[shift] \n\t"
- "packushb %[dest1], %[src0], %[src1] \n\t"
-
- "pavgb %[dest], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask),
- [shift] "f"(shift), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- const uint8_t* s = src_ptr;
- const uint8_t* t = src_ptr + src_stride;
-
- uint64_t s0, s1, t0, t1;
- uint64_t dest, dest0, dest1;
-
- const uint64_t ph = 0x0002000200020002ULL;
- const uint64_t mask = 0x00ff00ff00ff00ffULL;
- const uint64_t shift0 = 0x2ULL;
- const uint64_t shift1 = 0x8ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[s0], 0x00(%[s]) \n\t"
- "gsldlc1 %[s0], 0x07(%[s]) \n\t"
- "psrlh %[s1], %[s0], %[shift1] \n\t"
- "and %[s0], %[s0], %[mask] \n\t"
-
- "gsldrc1 %[t0], 0x00(%[t]) \n\t"
- "gsldlc1 %[t0], 0x07(%[t]) \n\t"
- "psrlh %[t1], %[t0], %[shift1] \n\t"
- "and %[t0], %[t0], %[mask] \n\t"
-
- "paddh %[dest0], %[s0], %[s1] \n\t"
- "paddh %[dest0], %[dest0], %[t0] \n\t"
- "paddh %[dest0], %[dest0], %[t1] \n\t"
- "paddh %[dest0], %[dest0], %[ph] \n\t"
- "psrlh %[dest0], %[dest0], %[shift0] \n\t"
-
- "gsldrc1 %[s0], 0x08(%[s]) \n\t"
- "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
- "psrlh %[s1], %[s0], %[shift1] \n\t"
- "and %[s0], %[s0], %[mask] \n\t"
-
- "gsldrc1 %[t0], 0x08(%[t]) \n\t"
- "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
- "psrlh %[t1], %[t0], %[shift1] \n\t"
- "and %[t0], %[t0], %[mask] \n\t"
-
- "paddh %[dest1], %[s0], %[s1] \n\t"
- "paddh %[dest1], %[dest1], %[t0] \n\t"
- "paddh %[dest1], %[dest1], %[t1] \n\t"
- "paddh %[dest1], %[dest1], %[ph] \n\t"
- "psrlh %[dest1], %[dest1], %[shift0] \n\t"
-
- "packushb %[dest], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[s], %[s], 0x10 \n\t"
- "daddiu %[t], %[t], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest)
- : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
- [mask] "f"(mask)
- : "memory");
-}
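// A scalar sketch of the 2x2 box average above (hypothetical helper, not part
// of libyuv): the ph constant adds the +2 rounding term before the >> 2.
static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride, uint8_t* dst,
                                    int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)(
        (s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}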
-
-void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
-
- const uint32_t* src = (const uint32_t*)(src_argb);
- uint32_t* dst = (uint32_t*)(dst_argb);
-
- uint64_t src0, src1, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "punpckhwd %[dest], %[src0], %[src1] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest_hi, dest_lo;
-
- __asm__ volatile(
- "1: \n\t"
- "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "lwc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "punpcklwd %[dest_lo], %[src0], %[src1] \n\t"
- "lwc1 %[src0], 0x04(%[src_ptr]) \n\t"
- "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t"
- "punpcklwd %[dest_hi], %[src0], %[src1] \n\t"
-
- "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- const uint8_t* s = src_argb;
- const uint8_t* t = src_argb + src_stride;
-
- uint64_t s0, s_hi, s_lo;
- uint64_t t0, t_hi, t_lo;
- uint64_t dest, dest_hi, dest_lo;
-
- const uint64_t mask = 0x0ULL;
- const uint64_t ph = 0x0002000200020002ULL;
- const uint64_t shift = 0x2ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[s0], 0x00(%[s]) \n\t"
- "gsldlc1 %[s0], 0x07(%[s]) \n\t"
- "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
- "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
- "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t"
-
- "gsldrc1 %[t0], 0x00(%[t]) \n\t"
- "gsldlc1 %[t0], 0x07(%[t]) \n\t"
- "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
- "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
- "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t"
- "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t"
-
- "paddh %[dest_lo], %[dest_lo], %[ph] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t"
-
- "gsldrc1 %[s0], 0x08(%[s]) \n\t"
- "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
- "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
- "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
- "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t"
-
- "gsldrc1 %[t0], 0x08(%[t]) \n\t"
- "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
- "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
- "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
- "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t"
- "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t"
-
- "paddh %[dest_hi], %[dest_hi], %[ph] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[s], %[s], 0x10 \n\t"
- "daddiu %[t], %[t], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo),
- [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest)
- : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
- [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit)
- : "memory");
-}
-
-void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1, dest;
- const uint64_t shift = 0x10ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "psrlw %[src0], %[src0], %[shift] \n\t"
-
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "psrlw %[src1], %[src1], %[shift] \n\t"
-
- "packsswh %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift] "f"(shift)
- : "memory");
-}
-
-void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest_hi, dest_lo;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "punpcklhw %[dest_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
-
- "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t"
-
- "pavgh %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- const uint16_t* s = src_ptr;
- const uint16_t* t = src_ptr + src_stride;
-
- uint64_t s0, s1, s_hi, s_lo;
- uint64_t t0, t1, t_hi, t_lo;
- uint64_t dest, dest0, dest1;
-
- const uint64_t ph = 0x0000000200000002ULL;
- const uint64_t mask = 0x0000ffff0000ffffULL;
- const uint64_t shift0 = 0x10ULL;
- const uint64_t shift1 = 0x2ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[s0], 0x00(%[s]) \n\t"
- "gsldlc1 %[s0], 0x07(%[s]) \n\t"
- "psrlw %[s1], %[s0], %[shift0] \n\t"
- "and %[s0], %[s0], %[mask] \n\t"
-
- "gsldrc1 %[t0], 0x00(%[t]) \n\t"
- "gsldlc1 %[t0], 0x07(%[t]) \n\t"
- "psrlw %[t1], %[t0], %[shift0] \n\t"
- "and %[t0], %[t0], %[mask] \n\t"
-
- "paddw %[dest0], %[s0], %[s1] \n\t"
- "paddw %[dest0], %[dest0], %[t0] \n\t"
- "paddw %[dest0], %[dest0], %[t1] \n\t"
- "paddw %[dest0], %[dest0], %[ph] \n\t"
- "psrlw %[dest0], %[dest0], %[shift1] \n\t"
-
- "gsldrc1 %[s0], 0x08(%[s]) \n\t"
- "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
- "psrlw %[s1], %[s0], %[shift0] \n\t"
- "and %[s0], %[s0], %[mask] \n\t"
-
- "gsldrc1 %[t0], 0x08(%[t]) \n\t"
- "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
- "psrlw %[t1], %[t0], %[shift0] \n\t"
- "and %[t0], %[t0], %[mask] \n\t"
-
- "paddw %[dest1], %[s0], %[s1] \n\t"
- "paddw %[dest1], %[dest1], %[t0] \n\t"
- "paddw %[dest1], %[dest1], %[t1] \n\t"
- "paddw %[dest1], %[dest1], %[ph] \n\t"
- "psrlw %[dest1], %[dest1], %[shift1] \n\t"
-
- "packsswh %[dest], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[s], %[s], 0x10 \n\t"
- "daddiu %[t], %[t], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
- [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi),
- [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
- [dest] "=&f"(dest)
- : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
- [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleRowDown4_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest_hi, dest_lo;
-
- const uint64_t shift = 0x10ULL;
- const uint64_t mask = 0x000000ff000000ffULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "psrlw %[src0], %[src0], %[shift] \n\t"
- "and %[src0], %[src0], %[mask] \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "psrlw %[src1], %[src1], %[shift] \n\t"
- "and %[src1], %[src1], %[mask] \n\t"
- "packsswh %[dest_lo], %[src0], %[src1] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
- "psrlw %[src0], %[src0], %[shift] \n\t"
- "and %[src0], %[src0], %[mask] \n\t"
- "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
- "psrlw %[src1], %[src1], %[shift] \n\t"
- "and %[src1], %[src1], %[mask] \n\t"
- "packsswh %[dest_hi], %[src0], %[src1] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift] "f"(shift), [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest_hi, dest_lo;
-
- const uint64_t mask = 0x0ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "punpckhhw %[dest_lo], %[src0], %[src1] \n\t"
- "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
- "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
- "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [mask] "f"(mask)
- : "memory");
-}
-
-#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
- "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
- "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
-
-#define DO_SCALEROWDOWN4BOX_LOOP(reg) \
- "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
- "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
- "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
- \
- "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_PUNPCKADD() \
- \
- "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_PUNPCKADD() \
- \
- "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_PUNPCKADD() \
- \
- "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \
- "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \
- "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \
- "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \
- "paddh " #reg ", " #reg ", %[ph] \n\t" \
- "psrlh " #reg ", " #reg ", %[shift] \n\t" \
- \
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
- "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
- "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
-
-/* LibYUVScaleTest.ScaleDownBy4_Box */
-void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- const uint8_t* src0_ptr = src_ptr;
- const uint8_t* src1_ptr = src_ptr + src_stride;
- const uint8_t* src2_ptr = src_ptr + src_stride * 2;
- const uint8_t* src3_ptr = src_ptr + src_stride * 3;
-
- uint64_t src, src_hi, src_lo;
- uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
-
- const uint64_t mask0 = 0x0ULL;
- const uint64_t mask1 = 0x0001000100010001ULL;
- const uint64_t ph = 0x0008000800080008ULL;
- const uint64_t shift = 0x4ULL;
-
- __asm__ volatile(
- "1: \n\t"
-
- DO_SCALEROWDOWN4BOX_LOOP(%[dest0])
- DO_SCALEROWDOWN4BOX_LOOP(%[dest1])
- DO_SCALEROWDOWN4BOX_LOOP(%[dest2])
- DO_SCALEROWDOWN4BOX_LOOP(%[dest3])
-
- "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[dest_hi], %[dest2], %[dest3] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
- [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
- [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
- [ph] "f"(ph), [mask1] "f"(mask1)
- : "memory");
-}
-
-#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
- "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
- "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
-
-#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \
- "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
- "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
- "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
- \
- "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
- \
- "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
- \
- "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
- \
- "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \
- "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \
- "paddw %[dest], %[dest_hi], %[dest] \n\t" \
- "paddw %[dest], %[dest], %[ph] \n\t" \
- "psraw %[dest], %[dest], %[shift] \n\t" \
- "and " #reg ", %[dest], %[mask1] \n\t" \
- \
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
- "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
- "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
-
-/* LibYUVScaleTest.ScaleDownBy4_Box_16 */
-void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- const uint16_t* src0_ptr = src_ptr;
- const uint16_t* src1_ptr = src_ptr + src_stride;
- const uint16_t* src2_ptr = src_ptr + src_stride * 2;
- const uint16_t* src3_ptr = src_ptr + src_stride * 3;
-
- uint64_t src, src_hi, src_lo;
- uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
-
- const uint64_t mask0 = 0x0ULL;
- const uint64_t mask1 = 0x00000000ffffffffULL;
- const uint64_t ph = 0x0000000800000008ULL;
- const uint64_t shift = 0x04ULL;
-
- __asm__ volatile(
- "1: \n\t"
-
- DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0])
- DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1])
- DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2])
- DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3])
- "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t"
- "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
- [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
- [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
- [ph] "f"(ph), [mask1] "f"(mask1)
- : "memory");
-}
-
-// Scales a single row of pixels up by 2x using point sampling.
-void ScaleColsUp2_MMI(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- uint64_t src, dest;
-
- (void)x;
- (void)dx;
-
- __asm__ volatile(
- "1: \n\t"
- "lwc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "punpcklbh %[dest], %[src], %[src] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
- const uint16_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- uint64_t src, dest;
-
- (void)x;
- (void)dx;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
-
- "punpcklhw %[dest], %[src], %[src] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "punpckhhw %[dest], %[src], %[src] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
- uint64_t src, src_hi, src_lo, dest0, dest1;
- const uint64_t mask = 0x0ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask] \n\t"
-
- "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "paddush %[dest0], %[dest0], %[src_lo] \n\t"
- "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
- "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "paddush %[dest1], %[dest1], %[src_hi] \n\t"
-
- "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [src] "=&f"(src)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
- [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
- uint32_t* dst_ptr,
- int src_width) {
- uint64_t src, src_hi, src_lo, dest0, dest1;
- const uint64_t mask = 0x0ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "punpcklhw %[src_lo], %[src], %[mask] \n\t"
- "punpckhhw %[src_hi], %[src], %[mask] \n\t"
-
- "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "paddw %[dest0], %[dest0], %[src_lo] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
-
- "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
- "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "paddw %[dest1], %[dest1], %[src_hi] \n\t"
- "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [src] "=&f"(src)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
- [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
- "lwc1 %[src1], 0x00(%[src_ptr]) \n\t"
- "punpcklwd %[dest], %[src0], %[src1] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb),
- [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- const uint8_t* src0_ptr = src_argb;
- const uint8_t* src1_ptr = src_argb + src_stride;
-
- uint64_t src0, src1, src_hi, src_lo;
- uint64_t dest, dest_hi, dest_lo, dest0, dest1;
-
- const uint64_t mask = 0x0ULL;
- const uint64_t ph = 0x0002000200020002ULL;
- const uint64_t shift = 0x2ULL;
-
- __asm__ volatile(
- "1: \n\t"
-
- "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
- "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
- "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
-
- "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
- "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
- "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
- "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
- "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
- "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t"
- "paddh %[dest0], %[dest0], %[ph] \n\t"
- "psrlh %[dest0], %[dest0], %[shift] \n\t"
-
- "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
- "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
-
- "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
- "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
- "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
-
- "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
- "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
- "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
- "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
- "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
- "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t"
- "paddh %[dest1], %[dest1], %[ph] \n\t"
- "psrlh %[dest1], %[dest1], %[shift] \n\t"
-
- "packushb %[dest], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
- "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
- [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
- [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
- [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask),
- [ph] "f"(ph)
- : "memory");
-}
-
-// Scales a single row of pixels using point sampling.
-void ScaleARGBCols_MMI(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- const uint32_t* src = (const uint32_t*)(src_argb);
- uint32_t* dst = (uint32_t*)(dst_argb);
-
- const uint32_t* src_tmp;
-
- uint64_t dest, offset;
-
- const uint64_t shift0 = 16;
- const uint64_t shift1 = 2;
-
- __asm__ volatile(
- "1: \n\t"
- "srav %[offset], %[x], %[shift0] \n\t"
- "sllv %[offset], %[offset], %[shift1] \n\t"
- "dadd %[src_tmp], %[src_ptr], %[offset] \n\t"
- "lwc1 %[dest], 0x00(%[src_tmp]) \n\t"
- "swc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "dadd %[x], %[x], %[dx] \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t"
- "daddi %[width], %[width], -0x01 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1)
- : "memory");
-}
-
-// Scales a single row of pixels up by 2x using point sampling.
-void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- uint64_t src, dest0, dest1;
- (void)x;
- (void)dx;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "punpcklwd %[dest0], %[src], %[src] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "punpckhwd %[dest1], %[src], %[src] \n\t"
- "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
- : "memory");
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-/* LibYUVBaseTest.TestFixedDiv */
-int FixedDiv_MIPS(int num, int div) {
- int quotient = 0;
- const int shift = 16;
-
- asm(
- "dsll %[num], %[num], %[shift] \n\t"
- "ddiv %[num], %[div] \t\n"
- "mflo %[quo] \t\n"
- : [quo] "+&r"(quotient)
- : [num] "r"(num), [div] "r"(div), [shift] "r"(shift));
-
- return quotient;
-}
-
-// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
-/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */
-int FixedDiv1_MIPS(int num, int div) {
- int quotient = 0;
- const int shift = 16;
- const int val1 = 1;
- const int64_t val11 = 0x00010001ULL;
-
- asm(
- "dsll %[num], %[num], %[shift] \n\t"
- "dsub %[num], %[num], %[val11] \n\t"
- "dsub %[div], %[div], %[val1] \n\t"
- "ddiv %[num], %[div] \t\n"
- "mflo %[quo] \t\n"
- : [quo] "+&r"(quotient)
- : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11),
- [shift] "r"(shift));
-
- return quotient;
-}
-
-// Reads 8x2 pixels, upsamples them with bilinear filtering and writes 16x1.
-// Actually reads one extra source pixel, so 9x2.
-void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- const uint16_t* src2_ptr = src_ptr + src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest04, dest15, dest26, dest37;
- uint64_t tmp0, tmp1, tmp2, tmp3;
-
- const uint64_t mask0 = 0x0003000900030009ULL;
- const uint64_t mask1 = 0x0001000300010003ULL;
- const uint64_t mask2 = 0x0009000300090003ULL;
- const uint64_t mask3 = 0x0003000100030001ULL;
- const uint64_t ph = 0x0000000800000008ULL;
- const uint64_t shift = 4;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t"
- "pmaddhw %[dest04], %[src0], %[mask0] \n\t"
- "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t"
- "pmaddhw %[dest], %[src1], %[mask1] \n\t"
- "paddw %[dest04], %[dest04], %[dest] \n\t"
- "paddw %[dest04], %[dest04], %[ph] \n\t"
- "psrlw %[dest04], %[dest04], %[shift] \n\t"
-
- "pmaddhw %[dest15], %[src0], %[mask2] \n\t"
- "pmaddhw %[dest], %[src1], %[mask3] \n\t"
- "paddw %[dest15], %[dest15], %[dest] \n\t"
- "paddw %[dest15], %[dest15], %[ph] \n\t"
- "psrlw %[dest15], %[dest15], %[shift] \n\t"
-
- "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t"
- "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t"
- "pmaddhw %[dest26], %[src0], %[mask0] \n\t"
- "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t"
- "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t"
- "pmaddhw %[dest], %[src1], %[mask1] \n\t"
- "paddw %[dest26], %[dest26], %[dest] \n\t"
- "paddw %[dest26], %[dest26], %[ph] \n\t"
- "psrlw %[dest26], %[dest26], %[shift] \n\t"
-
- "pmaddhw %[dest37], %[src0], %[mask2] \n\t"
- "pmaddhw %[dest], %[src1], %[mask3] \n\t"
- "paddw %[dest37], %[dest37], %[dest] \n\t"
- "paddw %[dest37], %[dest37], %[ph] \n\t"
- "psrlw %[dest37], %[dest37], %[shift] \n\t"
-
- /* tmp0 = ( 00 04 02 06 ) */
- "packsswh %[tmp0], %[dest04], %[dest26] \n\t"
- /* tmp1 = ( 01 05 03 07 ) */
- "packsswh %[tmp1], %[dest15], %[dest37] \n\t"
-
- /* tmp2 = ( 00 01 04 05 )*/
- "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t"
- /* tmp3 = ( 02 03 06 07 )*/
- "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t"
-
- /* ( 00 01 02 03 ) */
- "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- /* ( 04 05 06 07 ) */
- "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04),
- [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37),
- [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
- [tmp3] "=&f"(tmp3), [dest] "=&f"(dest)
- : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst),
- [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph)
- : "memory");
-}
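The mask constants above encode a 9:3:3:1 bilinear kernel: pmaddhw pairs each 9 with a 3 (top row) and each 3 with a 1 (bottom row), the four weights sum to 16, and ph/shift supply the rounding divide. In scalar form each output pixel computes the tap below, where a is the nearest source pixel and b, c, d its right, lower, and diagonal neighbors (a sketch of the arithmetic, not the deleted code):

    #include <stdint.h>

    // out = (9*a + 3*b + 3*c + 1*d + 8) >> 4  -- weights ordered by proximity.
    static inline uint16_t Up2Tap(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
      return (uint16_t)((9 * a + 3 * b + 3 * c + d + 8) >> 4);
    }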
-
-// clang-format on
-
-#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc
deleted file mode 100644
index 366b155b..00000000
--- a/files/source/scale_neon.cc
+++ /dev/null
@@ -1,958 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
- !defined(__aarch64__)
-
-// NEON downscalers with interpolation.
-// Provided by Fritz Koenig
-
-// Read 32x1, throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- // load even pixels into q0, odd into q1
- "vld2.8 {q0, q1}, [%0]! \n"
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1" // Clobber List
- );
-}
-
-// Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1" // Clobber List
- );
-}
-
-// Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- asm volatile(
- // change the stride to row 2 pointer
- "add %1, %0 \n"
- "1: \n"
- "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
- "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
- // row1
- "vpadal.u8 q1, q3 \n"
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
- // pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "q0", "q1", "q2", "q3" // Clobber List
- );
-}
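A scalar restatement of the 2x2 box filter this loop vectorizes: each output byte is the rounded average of a 2x2 source block. Sketch modeled on the matching *_C routine in scale_common.cc, simplified to one pixel per iteration:

    #include <stddef.h>
    #include <stdint.h>

    void ScaleRowDown2Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride,
                            uint8_t* dst, int dst_width) {
      const uint8_t* s = src_ptr;                // row 1
      const uint8_t* t = src_ptr + src_stride;   // row 2
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // +2 rounds
        s += 2;
        t += 2;
      }
    }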
-
-void ScaleRowDown4_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vst1.8 {d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1", "memory", "cc");
-}
-
-void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- const uint8_t* src_ptr1 = src_ptr + src_stride;
- const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
- const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
- asm volatile(
- "1: \n"
- "vld1.8 {q0}, [%0]! \n" // load up 16x4
- "vld1.8 {q1}, [%3]! \n"
- "vld1.8 {q2}, [%4]! \n"
- "vld1.8 {q3}, [%5]! \n"
- "subs %2, %2, #4 \n"
- "vpaddl.u8 q0, q0 \n"
- "vpadal.u8 q0, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q0, q3 \n"
- "vpaddl.u16 q0, q0 \n"
- "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
- "vmovn.u16 d0, q0 \n"
- "vst1.32 {d0[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_ptr1), // %3
- "+r"(src_ptr2), // %4
- "+r"(src_ptr3) // %5
- :
- : "q0", "q1", "q2", "q3", "memory", "cc");
-}
-
-// Downscale from 4 to 3 pixels. Use the neon multilane read/write
-// to load every 4th pixel into 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #24 \n"
- "vmov d2, d3 \n" // order d0, d1, d2
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "d0", "d1", "d2", "d3", "memory", "cc");
-}
-
-void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
-
- // filter src line 0 with src line 1
- // expand chars to shorts to allow for room
- // when adding lines together
- "vmovl.u8 q8, d4 \n"
- "vmovl.u8 q9, d5 \n"
- "vmovl.u8 q10, d6 \n"
- "vmovl.u8 q11, d7 \n"
-
- // 3 * line_0 + line_1
- "vmlal.u8 q8, d0, d24 \n"
- "vmlal.u8 q9, d1, d24 \n"
- "vmlal.u8 q10, d2, d24 \n"
- "vmlal.u8 q11, d3, d24 \n"
-
- // (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q8, #2 \n"
- "vqrshrn.u16 d1, q9, #2 \n"
- "vqrshrn.u16 d2, q10, #2 \n"
- "vqrshrn.u16 d3, q11, #2 \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q8, d1 \n"
- "vmlal.u8 q8, d0, d24 \n"
- "vqrshrn.u16 d0, q8, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q8, d2 \n"
- "vmlal.u8 q8, d3, d24 \n"
- "vqrshrn.u16 d2, q8, #2 \n"
-
- "vst3.8 {d0, d1, d2}, [%1]! \n"
-
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
- "cc");
-}
-
-void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
- // average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
- "vrhadd.u8 q1, q1, q3 \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
- "vmlal.u8 q3, d0, d24 \n"
- "vqrshrn.u16 d0, q3, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
- "vmlal.u8 q3, d3, d24 \n"
- "vqrshrn.u16 d2, q3, #2 \n"
-
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
-}
-
-#define HAS_SCALEROWDOWN38_NEON
-static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
- 22, 24, 27, 30, 0, 0, 0, 0};
-static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
- 18, 6, 14, 19, 0, 0, 0, 0};
-static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12};
-static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18};
-
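Why 65536/12 for a divide by 6 (and 65536/18 for 9)? The vqrdmulh used by the box filters below is a doubling multiply: for .s16 lanes it returns roughly (2*a*b + 0x8000) >> 16, so the constant must be half of 65536/n. A quick check with six pixels of value 85 (sum 510), assuming those instruction semantics:

    510 * (65536 / 12) = 510 * 5461 = 2785110
    (2 * 2785110 + 0x8000) >> 16 = 5602988 >> 16 = 85    // == 510 / 6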
-// 32 -> 12
-void ScaleRowDown38_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "vld1.8 {q3}, [%3] \n"
- "1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
- "subs %2, %2, #12 \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
- "vst1.8 {d4}, [%1]! \n"
- "vst1.32 {d5[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(&kShuf38) // %3
- : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
-}
-
-// 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
-
- asm volatile(
- "vld1.16 {q13}, [%5] \n"
- "vld1.8 {q14}, [%6] \n"
- "vld1.8 {q15}, [%7] \n"
- "add %3, %0 \n"
- "1: \n"
-
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
- "subs %2, %2, #12 \n"
-
-      // Shuffle the input data around to align it
-      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
- "vtrn.u8 d16, d17 \n"
-
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
- "vtrn.u8 d18, d19 \n"
-
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
- "vpaddl.u8 q8, q8 \n"
-
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
- "vpaddl.u8 d19, d19 \n"
-
- // combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 q0, q8 \n"
- "vadd.u16 d4, d3, d7 \n"
- "vadd.u16 d4, d19 \n"
-
- // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
- // + s[6 + st * 1] + s[7 + st * 1]
- // + s[6 + st * 2] + s[7 + st * 2]) / 6
- "vqrdmulh.s16 q2, q2, q13 \n"
- "vmovn.u16 d4, q2 \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
- "vmovl.u8 q9, d18 \n"
-
- // combine source lines
- "vadd.u16 q1, q3 \n"
- "vadd.u16 q1, q9 \n"
-
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
-
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
-
- // 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
-
-      // Need to divide, but can't downshift as the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "vqrdmulh.s16 q0, q0, q15 \n"
-
- // Align for table lookup, vtbl requires registers to
- // be adjacent
- "vmov.u8 d2, d4 \n"
-
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
-
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride), // %3
- "+r"(src_ptr1) // %4
- : "r"(&kMult38_Div6), // %5
- "r"(&kShuf38_2), // %6
- "r"(&kMult38_Div9) // %7
- : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
- "cc");
-}
-
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "vld1.16 {q13}, [%4] \n"
- "vld1.8 {q14}, [%5] \n"
- "add %3, %0 \n"
- "1: \n"
-
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "subs %2, %2, #12 \n"
-
-      // Shuffle the input data around to align it
-      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
-
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
-
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
-
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
-
- // combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 d4, d3, d7 \n"
-
- // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
-
- // combine source lines
- "vadd.u16 q1, q3 \n"
-
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
-
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
-
- // 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
-
-      // Need to divide, but can't downshift as the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "vqrdmulh.s16 q0, q0, q13 \n"
-
- // Align for table lookup, vtbl requires registers to
- // be adjacent
- "vmov.u8 d2, d4 \n"
-
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
-
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2) // %5
- : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
-}
-
-// Add a row of bytes to a row of shorts. Used for box filter.
-// Reads 16 bytes and accumulates to 16 shorts at a time.
-void ScaleAddRow_NEON(const uint8_t* src_ptr,
- uint16_t* dst_ptr,
- int src_width) {
- asm volatile(
- "1: \n"
- "vld1.16 {q1, q2}, [%1] \n" // load accumulator
- "vld1.8 {q0}, [%0]! \n" // load 16 bytes
- "vaddw.u8 q2, q2, d1 \n" // add
- "vaddw.u8 q1, q1, d0 \n"
- "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
- "subs %2, %2, #16 \n" // 16 processed per loop
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_width) // %2
- :
- : "memory", "cc", "q0", "q1", "q2" // Clobber List
- );
-}
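The scalar equivalent is a one-line accumulate; the NEON win is purely doing 16 widening adds per iteration. Sketch after the matching *_C routine:

    #include <stdint.h>

    void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
      for (int x = 0; x < src_width; ++x) {
        dst_ptr[x] += src_ptr[x];  // widen u8 -> u16 and accumulate
      }
    }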
-
-// TODO(Yang Zhang): Investigate fewer load instructions for
-// the x/dx stepping
-#define LOAD2_DATA8_LANE(n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5 \n" \
- "add %3, %3, %4 \n" \
- "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n"
-
-// The NEON version mimics this formula (from row_common.cc):
-// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
-// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
-
-void ScaleFilterCols_NEON(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- int dx_offset[4] = {0, 1, 2, 3};
- int* tmp = dx_offset;
- const uint8_t* src_tmp = src_ptr;
- asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q3, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
- // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q1, q1, q0 \n"
- // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "vadd.s32 q2, q1, q3 \n"
- "vshl.i32 q0, q3, #1 \n" // 8 * dx
- "1: \n"
- LOAD2_DATA8_LANE(0)
- LOAD2_DATA8_LANE(1)
- LOAD2_DATA8_LANE(2)
- LOAD2_DATA8_LANE(3)
- LOAD2_DATA8_LANE(4)
- LOAD2_DATA8_LANE(5)
- LOAD2_DATA8_LANE(6)
- LOAD2_DATA8_LANE(7)
- "vmov q10, q1 \n"
- "vmov q11, q2 \n"
- "vuzp.16 q10, q11 \n"
- "vmovl.u8 q8, d6 \n"
- "vmovl.u8 q9, d7 \n"
- "vsubl.s16 q11, d18, d16 \n"
- "vsubl.s16 q12, d19, d17 \n"
- "vmovl.u16 q13, d20 \n"
- "vmovl.u16 q10, d21 \n"
- "vmul.s32 q11, q11, q13 \n"
- "vmul.s32 q12, q12, q10 \n"
- "vrshrn.s32 d18, q11, #16 \n"
- "vrshrn.s32 d19, q12, #16 \n"
- "vadd.s16 q8, q8, q9 \n"
- "vmovn.s16 d6, q8 \n"
-
- "vst1.8 {d6}, [%0]! \n" // store pixels
- "vadd.s32 q1, q1, q0 \n"
- "vadd.s32 q2, q2, q0 \n"
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(x), // %3
- "+r"(dx), // %4
- "+r"(tmp), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13"
- );
-}
-
-#undef LOAD2_DATA8_LANE
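Transcribing the BLENDER formula quoted above into a standalone helper makes the fixed-point rounding explicit; f is a 16.16 fraction selecting between neighboring source pixels a and b (a direct restatement of the row_common.cc macro):

    #include <stdint.h>

    // dst = a + f * (b - a), computed in 16.16 fixed point with rounding.
    static inline uint8_t Blender(uint8_t a, uint8_t b, int f) {
      return (uint8_t)((int)a + ((f * ((int)b - (int)a) + 0x8000) >> 16));
    }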
-
-// 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #64 \n"
- "beq 75f \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
- "cmp %4, #192 \n"
- "beq 25f \n"
-
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
- // General purpose row blend.
- "1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
-
- // Blend 25 / 75.
- "25: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
- "b 99f \n"
-
- // Blend 50 / 50.
- "50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
-
- // Blend 75 / 25.
- "75: \n"
- "vld1.8 {q1}, [%1]! \n"
- "vld1.8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
- "b 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
-
- "99: \n"
- "vst1.8 {d1[7]}, [%0] \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(source_y_fraction) // %4
- :
- : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
-}
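Per pixel, the general-purpose path computes a weighted average of the two rows; the 0/64/128/192 branches above are just cheaper vrhadd forms of the same blend. A scalar sketch of what the vmull/vmlal/vrshrn sequence evaluates:

    #include <stdint.h>

    // y_fraction = source_y_fraction in [0, 256); 0 copies row 0 unchanged.
    static inline uint8_t BlendRows(uint8_t row0, uint8_t row1, int y_fraction) {
      return (uint8_t)((row0 * (256 - y_fraction) + row1 * y_fraction + 128) >> 8);
    }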
-
-void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vmov q2, q1 \n" // load next 8 ARGB
- "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
-// 4a: 3e04 subs r6, #4
-// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]!
-// 50: ef64 21f4 vorr q9, q10, q10
-// 54: f942 038d vst2.32 {d16-d19}, [r2]!
-// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46>
-
-void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vrhadd.u8 q1, q2, q3 \n" // rounding half add
- "vst2.32 {q0, q1}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- asm volatile(
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
- "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
- "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
- "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
- "vrshrn.u16 d1, q1, #2 \n"
- "vrshrn.u16 d2, q2, #2 \n"
- "vrshrn.u16 d3, q3, #2 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "mov r12, %3, lsl #2 \n"
- "1: \n"
- "vld1.32 {d0[0]}, [%0], r12 \n"
- "vld1.32 {d0[1]}, [%0], r12 \n"
- "vld1.32 {d1[0]}, [%0], r12 \n"
- "vld1.32 {d1[1]}, [%0], r12 \n"
- "subs %2, %2, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"(src_stepx) // %3
- : "memory", "cc", "r12", "q0");
-}
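In scalar terms this is a strided 32-bit gather: copy every src_stepx-th ARGB word. Sketch modeled on the matching *_C routine (the assembly simply performs four such loads per iteration):

    #include <stddef.h>
    #include <stdint.h>

    void ScaleARGBRowDownEven_C(const uint8_t* src_argb, ptrdiff_t src_stride,
                                int src_stepx, uint8_t* dst_argb, int dst_width) {
      const uint32_t* src = (const uint32_t*)src_argb;  // 1 ARGB pixel = 4 bytes
      uint32_t* dst = (uint32_t*)dst_argb;
      (void)src_stride;
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = *src;
        src += src_stepx;
      }
    }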
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- asm volatile(
- "mov r12, %4, lsl #2 \n"
- "add %1, %1, %0 \n"
- "1: \n"
- "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
- "vld1.8 {d1}, [%1], r12 \n"
- "vld1.8 {d2}, [%0], r12 \n"
- "vld1.8 {d3}, [%1], r12 \n"
- "vld1.8 {d4}, [%0], r12 \n"
- "vld1.8 {d5}, [%1], r12 \n"
- "vld1.8 {d6}, [%0], r12 \n"
- "vld1.8 {d7}, [%1], r12 \n"
- "vaddl.u8 q0, d0, d1 \n"
- "vaddl.u8 q1, d2, d3 \n"
- "vaddl.u8 q2, d4, d5 \n"
- "vaddl.u8 q3, d6, d7 \n"
- "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
- "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
- "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
- "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
- "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
- "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
- "subs %3, %3, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width) // %3
- : "r"(src_stepx) // %4
- : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
-}
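The boxed variant averages a 2x2 block per output pixel, channel by channel; the vswp shuffles above only exist to pair the right sums for a single vadd. Scalar sketch after the matching *_C routine:

    #include <stddef.h>
    #include <stdint.h>

    void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, ptrdiff_t src_stride,
                                   int src_stepx, uint8_t* dst_argb, int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        for (int c = 0; c < 4; ++c) {  // B, G, R, A
          dst_argb[c] = (uint8_t)((src_argb[c] + src_argb[c + 4] +
                                   src_argb[src_stride + c] +
                                   src_argb[src_stride + c + 4] + 2) >> 2);
        }
        src_argb += src_stepx * 4;
        dst_argb += 4;
      }
    }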
-
-// TODO(Yang Zhang): Investigate fewer load instructions for
-// the x/dx stepping
-#define LOAD1_DATA32_LANE(dn, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- "vld1.32 {" #dn "[" #n "]}, [%6] \n"
-
-void ScaleARGBCols_NEON(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- int tmp;
- const uint8_t* src_tmp = src_argb;
- asm volatile(
- "1: \n"
- // clang-format off
- LOAD1_DATA32_LANE(d0, 0)
- LOAD1_DATA32_LANE(d0, 1)
- LOAD1_DATA32_LANE(d1, 0)
- LOAD1_DATA32_LANE(d1, 1)
- LOAD1_DATA32_LANE(d2, 0)
- LOAD1_DATA32_LANE(d2, 1)
- LOAD1_DATA32_LANE(d3, 0)
- LOAD1_DATA32_LANE(d3, 1)
- // clang-format on
- "vst1.32 {q0, q1}, [%0]! \n" // store pixels
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width), // %2
- "+r"(x), // %3
- "+r"(dx), // %4
- "=&r"(tmp), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "q0", "q1");
-}
-
-#undef LOAD1_DATA32_LANE
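Per lane, the macro above implements the scalar stepping below: keep x as a 16.16 source coordinate, index by its integer part, advance by dx. Sketch after the matching *_C routine, simplified to one pixel per iteration:

    #include <stdint.h>

    void ScaleARGBCols_C(uint8_t* dst_argb, const uint8_t* src_argb,
                         int dst_width, int x, int dx) {
      const uint32_t* src = (const uint32_t*)src_argb;
      uint32_t* dst = (uint32_t*)dst_argb;
      for (int j = 0; j < dst_width; ++j) {
        dst[j] = src[x >> 16];  // integer part of the 16.16 coordinate
        x += dx;
      }
    }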
-
-// TODO(Yang Zhang): Investigate fewer load instructions for
-// the x/dx stepping
-#define LOAD2_DATA32_LANE(dn1, dn2, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
-
-void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- int dx_offset[4] = {0, 1, 2, 3};
- int* tmp = dx_offset;
- const uint8_t* src_tmp = src_argb;
- asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q9, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
- "vmov.i8 q3, #0x7f \n" // 0x7F
- "vmov.i16 q15, #0x7f \n" // 0x7F
- // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q8, q1, q0 \n"
- "1: \n"
- // d0, d1: a
- // d2, d3: b
- LOAD2_DATA32_LANE(d0, d2, 0)
- LOAD2_DATA32_LANE(d0, d2, 1)
- LOAD2_DATA32_LANE(d1, d3, 0)
- LOAD2_DATA32_LANE(d1, d3, 1)
- "vshrn.i32 d22, q8, #9 \n"
- "vand.16 d22, d22, d30 \n"
- "vdup.8 d24, d22[0] \n"
- "vdup.8 d25, d22[2] \n"
- "vdup.8 d26, d22[4] \n"
- "vdup.8 d27, d22[6] \n"
- "vext.8 d4, d24, d25, #4 \n"
- "vext.8 d5, d26, d27, #4 \n" // f
- "veor.8 q10, q2, q3 \n" // 0x7f ^ f
- "vmull.u8 q11, d0, d20 \n"
- "vmull.u8 q12, d1, d21 \n"
- "vmull.u8 q13, d2, d4 \n"
- "vmull.u8 q14, d3, d5 \n"
- "vadd.i16 q11, q11, q13 \n"
- "vadd.i16 q12, q12, q14 \n"
- "vshrn.i16 d0, q11, #7 \n"
- "vshrn.i16 d1, q12, #7 \n"
-
- "vst1.32 {d0, d1}, [%0]! \n" // store pixels
- "vadd.s32 q8, q8, q9 \n"
- "subs %2, %2, #4 \n" // 4 processed per loop
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width), // %2
- "+r"(x), // %3
- "+r"(dx), // %4
- "+r"(tmp), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15"
- );
-}
-
-#undef LOAD2_DATA32_LANE
-
-#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc
deleted file mode 100644
index 0a7b80ce..00000000
--- a/files/source/scale_neon64.cc
+++ /dev/null
@@ -1,1052 +0,0 @@
-/*
- * Copyright 2014 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/scale.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon armv8 64 bit.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-// Read 32x1, throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- // load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1" // Clobber List
- );
-}
-
-// Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- // load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1" // Clobber List
- );
-}
-
-// Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- asm volatile(
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
- "uaddlp v1.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
- "uadalp v1.8h, v3.16b \n"
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn2 v0.16b, v1.8h, #2 \n"
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void ScaleRowDown4_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "st1 {v2.8b}, [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1", "v2", "v3", "memory", "cc");
-}
-
-void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- const uint8_t* src_ptr1 = src_ptr + src_stride;
- const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
- const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
- asm volatile(
- "1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
- "ld1 {v1.16b}, [%2], #16 \n"
- "ld1 {v2.16b}, [%3], #16 \n"
- "ld1 {v3.16b}, [%4], #16 \n"
- "subs %w5, %w5, #4 \n"
- "uaddlp v0.8h, v0.16b \n"
- "uadalp v0.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n"
- "uadalp v0.8h, v3.16b \n"
- "addp v0.8h, v0.8h, v0.8h \n"
- "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
- "st1 {v0.s}[0], [%1], #4 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_ptr1), // %2
- "+r"(src_ptr2), // %3
- "+r"(src_ptr3), // %4
- "+r"(dst_width) // %5
- :
- : "v0", "v1", "v2", "v3", "memory", "cc");
-}
-
-// Downscale from 4 to 3 pixels. Use the neon multilane read/write
-// to load every 4th pixel into 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #24 \n"
- "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1", "v2", "v3", "memory", "cc");
-}
-
-void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
-
- // filter src line 0 with src line 1
- // expand chars to shorts to allow for room
- // when adding lines together
- "ushll v16.8h, v4.8b, #0 \n"
- "ushll v17.8h, v5.8b, #0 \n"
- "ushll v18.8h, v6.8b, #0 \n"
- "ushll v19.8h, v7.8b, #0 \n"
-
- // 3 * line_0 + line_1
- "umlal v16.8h, v0.8b, v20.8b \n"
- "umlal v17.8h, v1.8b, v20.8b \n"
- "umlal v18.8h, v2.8b, v20.8b \n"
- "umlal v19.8h, v3.8b, v20.8b \n"
-
- // (3 * line_0 + line_1) >> 2
- "uqrshrn v0.8b, v16.8h, #2 \n"
- "uqrshrn v1.8b, v17.8h, #2 \n"
- "uqrshrn v2.8b, v18.8h, #2 \n"
- "uqrshrn v3.8b, v19.8h, #2 \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v16.8h, v1.8b, #0 \n"
- "umlal v16.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v16.8h, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v16.8h, v2.8b, #0 \n"
- "umlal v16.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v16.8h, #2 \n"
-
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
-
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
- "v19", "v20", "memory", "cc");
-}
-
-void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
- // average src line 0 with src line 1
- "urhadd v0.8b, v0.8b, v4.8b \n"
- "urhadd v1.8b, v1.8b, v5.8b \n"
- "urhadd v2.8b, v2.8b, v6.8b \n"
- "urhadd v3.8b, v3.8b, v7.8b \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v4.8h, v1.8b, #0 \n"
- "umlal v4.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v4.8h, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v4.8h, v2.8b, #0 \n"
- "umlal v4.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v4.8h, #2 \n"
-
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
-}
-
-static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
- 22, 24, 27, 30, 0, 0, 0, 0};
-static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
- 34, 6, 22, 35, 0, 0, 0, 0};
-static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12};
-static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18};
-
-// 32 -> 12
-void ScaleRowDown38_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "ld1 {v3.16b}, [%3] \n"
- "1: \n"
- "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #12 \n"
- "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
- "st1 {v2.8b}, [%1], #8 \n"
- "st1 {v2.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(&kShuf38) // %3
- : "v0", "v1", "v2", "v3", "memory", "cc");
-}
-
-// 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
- ptrdiff_t tmp_src_stride = src_stride;
-
- asm volatile(
- "ld1 {v29.8h}, [%5] \n"
- "ld1 {v30.16b}, [%6] \n"
- "ld1 {v31.8h}, [%7] \n"
- "add %2, %2, %0 \n"
- "1: \n"
-
- // 00 40 01 41 02 42 03 43
- // 10 50 11 51 12 52 13 53
- // 20 60 21 61 22 62 23 63
- // 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
- "subs %w4, %w4, #12 \n"
-
-      // Shuffle the input data around to align it
-      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // 00 10 01 11 02 12 03 13
- // 40 50 41 51 42 52 43 53
- "trn1 v20.8b, v0.8b, v1.8b \n"
- "trn2 v21.8b, v0.8b, v1.8b \n"
- "trn1 v22.8b, v4.8b, v5.8b \n"
- "trn2 v23.8b, v4.8b, v5.8b \n"
- "trn1 v24.8b, v16.8b, v17.8b \n"
- "trn2 v25.8b, v16.8b, v17.8b \n"
-
- // 20 30 21 31 22 32 23 33
- // 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
- "trn1 v16.8b, v18.8b, v19.8b \n"
- "trn2 v17.8b, v18.8b, v19.8b \n"
-
- // 00+10 01+11 02+12 03+13
- // 40+50 41+51 42+52 43+53
- "uaddlp v20.4h, v20.8b \n"
- "uaddlp v21.4h, v21.8b \n"
- "uaddlp v22.4h, v22.8b \n"
- "uaddlp v23.4h, v23.8b \n"
- "uaddlp v24.4h, v24.8b \n"
- "uaddlp v25.4h, v25.8b \n"
-
- // 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
- "uaddlp v17.4h, v17.8b \n"
-
- // combine source lines
- "add v20.4h, v20.4h, v22.4h \n"
- "add v21.4h, v21.4h, v23.4h \n"
- "add v20.4h, v20.4h, v24.4h \n"
- "add v21.4h, v21.4h, v25.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
- "add v2.4h, v2.4h, v17.4h \n"
-
- // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
- // + s[6 + st * 1] + s[7 + st * 1]
- // + s[6 + st * 2] + s[7 + st * 2]) / 6
- "sqrdmulh v2.8h, v2.8h, v29.8h \n"
- "xtn v2.8b, v2.8h \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "ushll v16.8h, v16.8b, #0 \n"
- "uaddl v0.8h, v0.8b, v4.8b \n"
-
- // combine source lines
- "add v0.8h, v0.8h, v16.8h \n"
-
- // xx 20 xx 21 xx 22 xx 23
- // xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
-
- // 0+1+2, 3+4+5
- "add v20.8h, v20.8h, v0.8h \n"
- "add v21.8h, v21.8h, v4.8h \n"
-
-      // Need to divide, but can't downshift as the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "sqrdmulh v0.8h, v20.8h, v31.8h \n"
- "sqrdmulh v1.8h, v21.8h, v31.8h \n"
-
- // Align for table lookup, vtbl requires registers to be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
-
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(tmp_src_stride), // %2
- "+r"(src_ptr1), // %3
- "+r"(dst_width) // %4
- : "r"(&kMult38_Div6), // %5
- "r"(&kShuf38_2), // %6
- "r"(&kMult38_Div9) // %7
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
- "memory", "cc");
-}
-
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- // TODO(fbarchard): use src_stride directly for clang 3.5+.
- ptrdiff_t tmp_src_stride = src_stride;
- asm volatile(
- "ld1 {v30.8h}, [%4] \n"
- "ld1 {v31.16b}, [%5] \n"
- "add %2, %2, %0 \n"
- "1: \n"
-
- // 00 40 01 41 02 42 03 43
- // 10 50 11 51 12 52 13 53
- // 20 60 21 61 22 62 23 63
- // 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "subs %w3, %w3, #12 \n"
-
-      // Shuffle the input data around to align it
-      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // 00 10 01 11 02 12 03 13
- // 40 50 41 51 42 52 43 53
- "trn1 v16.8b, v0.8b, v1.8b \n"
- "trn2 v17.8b, v0.8b, v1.8b \n"
- "trn1 v18.8b, v4.8b, v5.8b \n"
- "trn2 v19.8b, v4.8b, v5.8b \n"
-
- // 20 30 21 31 22 32 23 33
- // 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
-
- // 00+10 01+11 02+12 03+13
- // 40+50 41+51 42+52 43+53
- "uaddlp v16.4h, v16.8b \n"
- "uaddlp v17.4h, v17.8b \n"
- "uaddlp v18.4h, v18.8b \n"
- "uaddlp v19.4h, v19.8b \n"
-
- // 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
-
- // combine source lines
- "add v16.4h, v16.4h, v18.4h \n"
- "add v17.4h, v17.4h, v19.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
-
- // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "uqrshrn v2.8b, v2.8h, #2 \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-
- // combine source lines
- "uaddl v0.8h, v0.8b, v4.8b \n"
-
- // xx 20 xx 21 xx 22 xx 23
- // xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
-
- // 0+1+2, 3+4+5
- "add v16.8h, v16.8h, v0.8h \n"
- "add v17.8h, v17.8h, v4.8h \n"
-
-      // Need to divide, but can't downshift as the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "sqrdmulh v0.8h, v16.8h, v30.8h \n"
- "sqrdmulh v1.8h, v17.8h, v30.8h \n"
-
- // Align for table lookup, vtbl requires registers to
- // be adjacent
-
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
-
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(tmp_src_stride), // %2
- "+r"(dst_width) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2) // %5
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
- "v19", "v30", "v31", "memory", "cc");
-}
-
-// Add a row of bytes to a row of shorts. Used for box filter.
-// Reads 16 bytes and accumulates to 16 shorts at a time.
-void ScaleAddRow_NEON(const uint8_t* src_ptr,
- uint16_t* dst_ptr,
- int src_width) {
- asm volatile(
- "1: \n"
- "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
- "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
- "uaddw v1.8h, v1.8h, v0.8b \n"
- "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_width) // %2
- :
- : "memory", "cc", "v0", "v1", "v2" // Clobber List
- );
-}
-
-// TODO(Yang Zhang): Investigate fewer load instructions for
-// the x/dx stepping
-#define LOAD2_DATA8_LANE(n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5 \n" \
- "add %3, %3, %4 \n" \
- "ld2 {v4.b, v5.b}[" #n "], [%6] \n"
-
-// The NEON version mimics this formula (from row_common.cc):
-// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
-// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
-
-void ScaleFilterCols_NEON(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- int dx_offset[4] = {0, 1, 2, 3};
- int* tmp = dx_offset;
- const uint8_t* src_tmp = src_ptr;
- int64_t x64 = (int64_t)x; // NOLINT
- int64_t dx64 = (int64_t)dx; // NOLINT
- asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v3.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
- // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v1.4s, v1.4s, v0.4s \n"
- // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "add v2.4s, v1.4s, v3.4s \n"
- "shl v0.4s, v3.4s, #1 \n" // 8 * dx
- "1: \n"
- LOAD2_DATA8_LANE(0)
- LOAD2_DATA8_LANE(1)
- LOAD2_DATA8_LANE(2)
- LOAD2_DATA8_LANE(3)
- LOAD2_DATA8_LANE(4)
- LOAD2_DATA8_LANE(5)
- LOAD2_DATA8_LANE(6)
- LOAD2_DATA8_LANE(7)
- "mov v6.16b, v1.16b \n"
- "mov v7.16b, v2.16b \n"
- "uzp1 v6.8h, v6.8h, v7.8h \n"
- "ushll v4.8h, v4.8b, #0 \n"
- "ushll v5.8h, v5.8b, #0 \n"
- "ssubl v16.4s, v5.4h, v4.4h \n"
- "ssubl2 v17.4s, v5.8h, v4.8h \n"
- "ushll v7.4s, v6.4h, #0 \n"
- "ushll2 v6.4s, v6.8h, #0 \n"
- "mul v16.4s, v16.4s, v7.4s \n"
- "mul v17.4s, v17.4s, v6.4s \n"
- "rshrn v6.4h, v16.4s, #16 \n"
- "rshrn2 v6.8h, v17.4s, #16 \n"
- "add v4.8h, v4.8h, v6.8h \n"
- "xtn v4.8b, v4.8h \n"
-
- "st1 {v4.8b}, [%0], #8 \n" // store pixels
- "add v1.4s, v1.4s, v0.4s \n"
- "add v2.4s, v2.4s, v0.4s \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(x64), // %3
- "+r"(dx64), // %4
- "+r"(tmp), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "v0", "v1", "v2", "v3",
- "v4", "v5", "v6", "v7", "v16", "v17"
- );
-}
-
-#undef LOAD2_DATA8_LANE
-
-// 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- int y_fraction = 256 - source_y_fraction;
- asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "add %2, %2, %1 \n"
- "cmp %w4, #64 \n"
- "b.eq 75f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
- "cmp %w4, #192 \n"
- "b.eq 25f \n"
-
- "dup v5.8b, %w4 \n"
- "dup v4.8b, %w5 \n"
- // General purpose row blend.
- "1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v6.8h, v0.8b, v4.8b \n"
- "umull2 v7.8h, v0.16b, v4.16b \n"
- "umlal v6.8h, v1.8b, v5.8b \n"
- "umlal2 v7.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v6.8h, #8 \n"
- "rshrn2 v0.16b, v7.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
-
- // Blend 25 / 75.
- "25: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 25b \n"
- "b 99f \n"
-
- // Blend 50 / 50.
- "50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
-
- // Blend 75 / 25.
- "75: \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "ld1 {v0.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 75b \n"
- "b 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
-
- "99: \n"
- "st1 {v0.b}[15], [%0] \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(source_y_fraction), // %4
- "+r"(y_fraction) // %5
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
-}
-
-void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "mov v2.16b, v3.16b \n"
- "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
-
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "urhadd v1.16b, v2.16b, v3.16b \n"
- "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- asm volatile(
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
- "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
- "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
- "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn v1.8b, v1.8h, #2 \n"
- "rshrn v2.8b, v2.8h, #2 \n"
- "rshrn v3.8b, v3.8h, #2 \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
- asm volatile(
- "1: \n"
- "ld1 {v0.s}[0], [%0], %3 \n"
- "ld1 {v0.s}[1], [%0], %3 \n"
- "ld1 {v0.s}[2], [%0], %3 \n"
- "ld1 {v0.s}[3], [%0], %3 \n"
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"((int64_t)(src_stepx * 4)) // %3
- : "memory", "cc", "v0");
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-// TODO(Yang Zhang): Might be worth another optimization pass in future.
-// It could be upgraded to 8 pixels at a time to start with.
-void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- asm volatile(
- "add %1, %1, %0 \n"
- "1: \n"
- "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
- "ld1 {v1.8b}, [%1], %4 \n"
- "ld1 {v2.8b}, [%0], %4 \n"
- "ld1 {v3.8b}, [%1], %4 \n"
- "ld1 {v4.8b}, [%0], %4 \n"
- "ld1 {v5.8b}, [%1], %4 \n"
- "ld1 {v6.8b}, [%0], %4 \n"
- "ld1 {v7.8b}, [%1], %4 \n"
- "uaddl v0.8h, v0.8b, v1.8b \n"
- "uaddl v2.8h, v2.8b, v3.8b \n"
- "uaddl v4.8h, v4.8b, v5.8b \n"
- "uaddl v6.8h, v6.8b, v7.8b \n"
- "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
- "mov v0.d[1], v2.d[0] \n"
- "mov v2.d[0], v16.d[1] \n"
- "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
- "mov v4.d[1], v6.d[0] \n"
- "mov v6.d[0], v16.d[1] \n"
- "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
- "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
- "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
- "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
- "subs %w3, %w3, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width) // %3
- : "r"((int64_t)(src_stepx * 4)) // %4
- : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
-}
-
-// TODO(Yang Zhang): Investigate fewer load instructions for
-// the x/dx stepping
-#define LOAD1_DATA32_LANE(vn, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- "ld1 {" #vn ".s}[" #n "], [%6] \n"
-
-void ScaleARGBCols_NEON(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- const uint8_t* src_tmp = src_argb;
- int64_t x64 = (int64_t)x; // NOLINT
- int64_t dx64 = (int64_t)dx; // NOLINT
- int64_t tmp64;
- asm volatile(
- "1: \n"
- // clang-format off
- LOAD1_DATA32_LANE(v0, 0)
- LOAD1_DATA32_LANE(v0, 1)
- LOAD1_DATA32_LANE(v0, 2)
- LOAD1_DATA32_LANE(v0, 3)
- LOAD1_DATA32_LANE(v1, 0)
- LOAD1_DATA32_LANE(v1, 1)
- LOAD1_DATA32_LANE(v1, 2)
- LOAD1_DATA32_LANE(v1, 3)
- // clang-format on
- "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width), // %2
- "+r"(x64), // %3
- "+r"(dx64), // %4
- "=&r"(tmp64), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "v0", "v1");
-}
-
-#undef LOAD1_DATA32_LANE
-
-// TODO(Yang Zhang): Investigate fewer load instructions for
-// the x/dx stepping
-#define LOAD2_DATA32_LANE(vn1, vn2, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
-
-void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- int dx_offset[4] = {0, 1, 2, 3};
- int* tmp = dx_offset;
- const uint8_t* src_tmp = src_argb;
- int64_t x64 = (int64_t)x; // NOLINT
- int64_t dx64 = (int64_t)dx; // NOLINT
- asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v6.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
- "movi v3.16b, #0x7f \n" // 0x7F
- "movi v4.8h, #0x7f \n" // 0x7F
- // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v5.4s, v1.4s, v0.4s \n"
- "1: \n"
- // d0, d1: a
- // d2, d3: b
- LOAD2_DATA32_LANE(v0, v1, 0)
- LOAD2_DATA32_LANE(v0, v1, 1)
- LOAD2_DATA32_LANE(v0, v1, 2)
- LOAD2_DATA32_LANE(v0, v1, 3)
- "shrn v2.4h, v5.4s, #9 \n"
- "and v2.8b, v2.8b, v4.8b \n"
- "dup v16.8b, v2.b[0] \n"
- "dup v17.8b, v2.b[2] \n"
- "dup v18.8b, v2.b[4] \n"
- "dup v19.8b, v2.b[6] \n"
- "ext v2.8b, v16.8b, v17.8b, #4 \n"
- "ext v17.8b, v18.8b, v19.8b, #4 \n"
- "ins v2.d[1], v17.d[0] \n" // f
- "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
- "umull v16.8h, v0.8b, v7.8b \n"
- "umull2 v17.8h, v0.16b, v7.16b \n"
- "umull v18.8h, v1.8b, v2.8b \n"
- "umull2 v19.8h, v1.16b, v2.16b \n"
- "add v16.8h, v16.8h, v18.8h \n"
- "add v17.8h, v17.8h, v19.8h \n"
- "shrn v0.8b, v16.8h, #7 \n"
- "shrn2 v0.16b, v17.8h, #7 \n"
-
- "st1 {v0.4s}, [%0], #16 \n" // store pixels
- "add v5.4s, v5.4s, v6.4s \n"
- "subs %w2, %w2, #4 \n" // 4 processed per loop
- "b.gt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width), // %2
- "+r"(x64), // %3
- "+r"(dx64), // %4
- "+r"(tmp), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
- "v6", "v7", "v16", "v17", "v18", "v19"
- );
-}
-
-#undef LOAD2_DATA32_LANE
-
-// Read 16x2 average down and write 8x1.
-void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- asm volatile(
- // change the stride to row 2 pointer
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
- "1: \n"
- "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #8 \n" // 8 processed per loop
- "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
- "uaddlp v1.4s, v1.8h \n"
- "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
- "uadalp v1.4s, v3.8h \n"
- "rshrn v0.4h, v0.4s, #2 \n" // round and pack
- "rshrn2 v0.8h, v1.4s, #2 \n"
- "st1 {v0.8h}, [%2], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-// Read 8x2 upsample with filtering and write 16x1.
-// Actually reads an extra pixel, so 9x2.
-void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- asm volatile(
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
- "movi v0.8h, #9 \n" // constants
- "movi v1.4s, #3 \n"
-
- "1: \n"
- "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
- "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
- "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
- "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
- "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
- "umull v16.4s, v3.4h, v0.4h \n"
- "umull2 v7.4s, v3.8h, v0.8h \n"
- "umull v18.4s, v4.4h, v0.4h \n"
- "umull2 v17.4s, v4.8h, v0.8h \n"
- "uaddw v16.4s, v16.4s, v6.4h \n"
- "uaddl2 v19.4s, v6.8h, v3.8h \n"
- "uaddl v3.4s, v6.4h, v3.4h \n"
- "uaddw2 v6.4s, v7.4s, v6.8h \n"
- "uaddl2 v7.4s, v5.8h, v4.8h \n"
- "uaddl v4.4s, v5.4h, v4.4h \n"
- "uaddw v18.4s, v18.4s, v5.4h \n"
- "mla v16.4s, v4.4s, v1.4s \n"
- "mla v18.4s, v3.4s, v1.4s \n"
- "mla v6.4s, v7.4s, v1.4s \n"
- "uaddw2 v4.4s, v17.4s, v5.8h \n"
- "uqrshrn v16.4h, v16.4s, #4 \n"
- "mla v4.4s, v19.4s, v1.4s \n"
- "uqrshrn2 v16.8h, v6.4s, #4 \n"
- "uqrshrn v17.4h, v18.4s, #4 \n"
- "uqrshrn2 v17.8h, v4.4s, #4 \n"
- "st2 {v16.8h-v17.8h}, [%2], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- : "r"(2LL), // %4
- "r"(14LL) // %5
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
- "v19" // Clobber List
- );
-}
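-
-// The kernel above is a 2x bilinear upsample: each output sample blends its
-// 2x2 source neighborhood with 9/3/3/1 weights and rounds (illustrative
-// sketch, not from the original source):
-//   dst = (9 * nearest + 3 * horizontal + 3 * vertical + diagonal + 8) >> 4;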
-
-#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/sync_chromium.py b/files/sync_chromium.py
deleted file mode 100755
index 4e51b6bd..00000000
--- a/files/sync_chromium.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-"""Script to download a Chromium checkout into the workspace.
-
-The script downloads a full Chromium Git clone and its DEPS.
-
-The following environment variable can be used to alter the behavior:
-* CHROMIUM_NO_HISTORY - If set to 1, a Git checkout with no history will be
- downloaded. This consumes less bandwidth and disk space but is known to be
- slower in general if you have a high-speed connection.
-
-After a successful sync has completed, a .last_sync_chromium file is written to
-the chromium directory. While it exists, no more gclient sync operations will be
-performed until the --target-revision changes or the SCRIPT_VERSION constant is
-incremented. The file can be removed manually to force a new sync.
-"""
-
-import argparse
-import os
-import subprocess
-import sys
-
-# Bump this whenever the algorithm changes and you need bots/devs to re-sync,
-# ignoring the .last_sync_chromium file
-SCRIPT_VERSION = 4
-
-ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
-CHROMIUM_NO_HISTORY = 'CHROMIUM_NO_HISTORY'
-
-def _parse_gclient_dict():
- gclient_dict = {}
- try:
- main_gclient = os.path.join(os.path.dirname(ROOT_DIR), '.gclient')
- with open(main_gclient, 'rb') as deps_content:
- exec(deps_content, gclient_dict)
- except Exception as e:
- print >> sys.stderr, 'error while parsing .gclient:', e
- return gclient_dict
-
-
-def get_cache_dir():
- return _parse_gclient_dict().get('cache_dir')
-
-
-def get_target_os_list():
- return ','.join(_parse_gclient_dict().get('target_os', []))
-
-
-def main():
- CR_DIR = os.path.join(ROOT_DIR, 'chromium')
-
- p = argparse.ArgumentParser()
- p.add_argument('--target-revision', required=True,
- help='The target chromium git revision [REQUIRED]')
- p.add_argument('--chromium-dir', default=CR_DIR,
- help=('The path to the chromium directory to sync '
- '(default: %(default)r)'))
- opts = p.parse_args()
- opts.chromium_dir = os.path.abspath(opts.chromium_dir)
-
- target_os_list = get_target_os_list()
-
- # Do a quick check to see if we were successful last time to make runhooks
- # super fast.
- flag_file = os.path.join(opts.chromium_dir, '.last_sync_chromium')
- flag_file_content = '\n'.join([
- str(SCRIPT_VERSION),
- opts.target_revision,
- repr(target_os_list),
- ])
- if (os.path.exists(os.path.join(opts.chromium_dir, 'src')) and
- os.path.exists(flag_file)):
- with open(flag_file, 'r') as f:
- if f.read() == flag_file_content:
- print 'Chromium already up to date: ', opts.target_revision
- return 0
- os.unlink(flag_file)
-
- env = os.environ.copy()
-
- # Avoid downloading NaCl toolchain as part of the Chromium hooks.
- env['GYP_CHROMIUM_NO_ACTION'] = '1'
- gclient_cmd = 'gclient.bat' if sys.platform.startswith('win') else 'gclient'
- args = [
- gclient_cmd, 'sync', '--force', '--revision', 'src@'+opts.target_revision
- ]
-
- if os.environ.get('CHROME_HEADLESS') == '1':
- # Running on a buildbot.
- args.append('-vvv')
-
- if sys.platform.startswith('win'):
- cache_path = os.path.join(os.path.splitdrive(ROOT_DIR)[0] + os.path.sep,
- 'b', 'git-cache')
- else:
- cache_path = '/b/git-cache'
- else:
- # Support developers setting the cache_dir in .gclient.
- cache_path = get_cache_dir()
-
- # Allow for users with poor internet connections to download a Git clone
- # without history (saves several gigs but is generally slower and doesn't work
- # with the Git cache).
- if os.environ.get(CHROMIUM_NO_HISTORY) == '1':
- if cache_path:
- print >> sys.stderr, (
- 'You cannot use "no-history" mode for syncing Chrome (i.e. set the '
- '%s environment variable to 1) when you have cache_dir configured in '
- 'your .gclient.' % CHROMIUM_NO_HISTORY)
- return 1
- args.append('--no-history')
- gclient_entries_file = os.path.join(opts.chromium_dir, '.gclient_entries')
- else:
- # Write a temporary .gclient file that has the cache_dir variable added.
- gclientfile = os.path.join(opts.chromium_dir, '.gclient')
- with open(gclientfile, 'rb') as spec:
- spec = spec.read().splitlines()
- spec[-1] = 'cache_dir = %r' % (cache_path,)
- with open(gclientfile + '.tmp', 'wb') as f:
- f.write('\n'.join(spec))
-
- args += [
- '--gclientfile', '.gclient.tmp',
- '--delete_unversioned_trees', '--reset', '--upstream'
- ]
- gclient_entries_file = os.path.join(opts.chromium_dir,
- '.gclient.tmp_entries')
-
- # To avoid gclient sync problems when DEPS entries have been removed, we must
- # wipe the gclient's entries file that contains cached URLs for all DEPS.
- if os.path.exists(gclient_entries_file):
- os.unlink(gclient_entries_file)
-
- if target_os_list:
- args += ['--deps=' + target_os_list]
-
- print 'Running "%s" in %s' % (' '.join(args), opts.chromium_dir)
- ret = subprocess.call(args, cwd=opts.chromium_dir, env=env)
- if ret == 0:
- with open(flag_file, 'wb') as f:
- f.write(flag_file_content)
-
- return ret
-
-
-if __name__ == '__main__':
- sys.exit(main())
diff --git a/files/third_party/gflags/BUILD.gn b/files/third_party/gflags/BUILD.gn
deleted file mode 100644
index af41b7ec..00000000
--- a/files/third_party/gflags/BUILD.gn
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-if (is_win) {
- gflags_gen_arch_root = "gen/win"
-} else {
- gflags_gen_arch_root = "gen/posix"
-}
-
-config("gflags_config") {
- include_dirs = [
- "$gflags_gen_arch_root/include", # For configured files.
- "src/src", # For everything else.
- ]
-
- defines = [
- # These macros exist so flags and symbols are properly exported when
- # building DLLs. Since we don't build DLLs, we need to disable them.
- "GFLAGS_DLL_DECL=",
- "GFLAGS_DLL_DECLARE_FLAG=",
- "GFLAGS_DLL_DEFINE_FLAG=",
- ]
-
- # GN orders flags on a target before flags from configs. The default config
- # adds -Wall, and this flag has to come after -Wall -- so it needs to
- # come from a config and can't be on the target directly.
- if (is_clang) {
- cflags = [ "-Wno-unused-local-typedef" ]
- }
-}
-
-source_set("gflags") {
- cflags = []
- sources = [
- "src/src/gflags.cc",
- "src/src/gflags_completions.cc",
- "src/src/gflags_reporting.cc",
- ]
- if (is_win) {
- sources += [ "src/src/windows_port.cc" ]
-
- cflags += [
- "/wd4005", # WIN32_LEAN_AND_MEAN.
- "/wd4267", # Conversion from size_t to "type".
- ]
- }
-
- include_dirs = [
- "$gflags_gen_arch_root/include/gflags", # For configured files.
- "$gflags_gen_arch_root/include/private", # For config.h
- ]
-
- public_configs = [ ":gflags_config" ]
-
- configs -= [ "//build/config/compiler:chromium_code" ]
- configs += [ "//build/config/compiler:no_chromium_code" ]
-
- if (is_win) {
- configs -= [ "//build/config/win:unicode" ]
- }
-
- if (is_clang) {
- # TODO(andrew): Look into fixing this warning upstream:
- # http://code.google.com/p/webrtc/issues/detail?id=760
- configs -= [ "//build/config/clang:extra_warnings" ]
- cflags += [ "-Wno-microsoft-include" ]
- }
-}
diff --git a/files/third_party/gflags/LICENSE b/files/third_party/gflags/LICENSE
deleted file mode 100644
index d15b0c24..00000000
--- a/files/third_party/gflags/LICENSE
+++ /dev/null
@@ -1,28 +0,0 @@
-Copyright (c) 2006, Google Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
- * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/files/third_party/gflags/README.libyuv b/files/third_party/gflags/README.libyuv
deleted file mode 100644
index 5b3bc2db..00000000
--- a/files/third_party/gflags/README.libyuv
+++ /dev/null
@@ -1,28 +0,0 @@
-URL: https://github.com/gflags/gflags
-Version: 2.1.2
-License: New BSD
-License File: LICENSE
-
-Description:
-The gflags package contains a library that implements commandline
-flags processing. As such it's a replacement for getopt(). It has
-increased flexibility, including built-in support for C++ types like
-string, and the ability to define flags in the source file in which
-they're used.
-
-Local Modifications: None
-
-
-How to update platform configuration files:
-The gen/ directory contains pre-generated configuration header files.
-Historically, all operating systems and architectures have generated
-similar configurations except for Windows, which is why there are only
-posix and win directories below gen/.
-When rolling gflags to a newer version, it's a good idea to check
-whether new configuration files need to be generated as well.
-Do this by running ./configure in the newly checked out version of
-gflags, then diff the generated files against the ones below gen/.
-If you notice a diff, replace the checked-in files with the newly
-generated ones.
-If you suspect platform-dependent changes other than Windows, you'll
-have to check out gflags on the other platforms as well and run
-./configure there too.
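-
-Illustrative flow for the steps above (directory names are examples only):
-  ./configure
-  diff -ru <freshly generated headers> gen/posix/include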
diff --git a/files/third_party/gflags/gen/posix/include/gflags/gflags.h b/files/third_party/gflags/gen/posix/include/gflags/gflags.h
deleted file mode 100644
index 0db38f5c..00000000
--- a/files/third_party/gflags/gen/posix/include/gflags/gflags.h
+++ /dev/null
@@ -1,573 +0,0 @@
-// Copyright (c) 2006, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// ---
-// Revamped and reorganized by Craig Silverstein
-//
-// This is the file that should be included by any file which declares
-// or defines a command line flag or wants to parse command line flags
-// or print a program usage message (which will include information about
-// flags). Executive summary, in the form of an example foo.cc file:
-//
-// #include "foo.h" // foo.h has a line "DECLARE_int32(start);"
-// #include "validators.h" // hypothetical file defining ValidateIsFile()
-//
-// DEFINE_int32(end, 1000, "The last record to read");
-//
-// DEFINE_string(filename, "my_file.txt", "The file to read");
-// // Crash if the specified file does not exist.
-// static bool dummy = RegisterFlagValidator(&FLAGS_filename,
-// &ValidateIsFile);
-//
-// DECLARE_bool(verbose); // some other file has a DEFINE_bool(verbose, ...)
-//
-// void MyFunc() {
-// if (FLAGS_verbose) printf("Records %d-%d\n", FLAGS_start, FLAGS_end);
-// }
-//
-// Then, at the command-line:
-// ./foo --noverbose --start=5 --end=100
-//
-// For more details, see
-// doc/gflags.html
-//
-// --- A note about thread-safety:
-//
-// We describe many functions in this routine as being thread-hostile,
-// thread-compatible, or thread-safe. Here are the meanings we use:
-//
-// thread-safe: it is safe for multiple threads to call this routine
-// (or, when referring to a class, methods of this class)
-// concurrently.
-// thread-hostile: it is not safe for multiple threads to call this
-// routine (or methods of this class) concurrently. In gflags,
-// most thread-hostile routines are intended to be called early in,
-// or even before, main() -- that is, before threads are spawned.
-// thread-compatible: it is safe for multiple threads to read from
-// this variable (when applied to variables), or to call const
-// methods of this class (when applied to classes), as long as no
-// other thread is writing to the variable or calling non-const
-// methods of this class.
-
-#ifndef GFLAGS_GFLAGS_H_
-#define GFLAGS_GFLAGS_H_
-
-#include <string>
-#include <vector>
-
-#include "gflags_declare.h" // IWYU pragma: export
-
-
-// We always want to export variables defined in user code
-#ifndef GFLAGS_DLL_DEFINE_FLAG
-# ifdef _MSC_VER
-# define GFLAGS_DLL_DEFINE_FLAG __declspec(dllexport)
-# else
-# define GFLAGS_DLL_DEFINE_FLAG
-# endif
-#endif
-
-
-namespace GFLAGS_NAMESPACE {
-
-
-// --------------------------------------------------------------------
-// To actually define a flag in a file, use DEFINE_bool,
-// DEFINE_string, etc. at the bottom of this file. You may also find
-// it useful to register a validator with the flag. This ensures that
-// when the flag is parsed from the commandline, or is later set via
-// SetCommandLineOption, we call the validation function. It is _not_
-// called when you assign the value to the flag directly using the = operator.
-//
-// The validation function should return true if the flag value is valid, and
-// false otherwise. If the function returns false for the new setting of the
-// flag, the flag will retain its current value. If it returns false for the
-// default value, ParseCommandLineFlags() will die.
-//
-// This function is safe to call at global construct time (as in the
-// example below).
-//
-// Example use:
-// static bool ValidatePort(const char* flagname, int32 value) {
-// if (value > 0 && value < 32768) // value is ok
-// return true;
-// printf("Invalid value for --%s: %d\n", flagname, (int)value);
-// return false;
-// }
-// DEFINE_int32(port, 0, "What port to listen on");
-// static bool dummy = RegisterFlagValidator(&FLAGS_port, &ValidatePort);
-
-// Returns true if successfully registered, false if not (because the
-// first argument doesn't point to a command-line flag, or because a
-// validator is already registered for this flag).
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const bool* flag, bool (*validate_fn)(const char*, bool));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int32* flag, bool (*validate_fn)(const char*, int32));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int64* flag, bool (*validate_fn)(const char*, int64));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const uint64* flag, bool (*validate_fn)(const char*, uint64));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const double* flag, bool (*validate_fn)(const char*, double));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const std::string* flag, bool (*validate_fn)(const char*, const std::string&));
-
-// Convenience macro for the registration of a flag validator
-#define DEFINE_validator(name, validator) \
- static const bool name##_validator_registered = \
- GFLAGS_NAMESPACE::RegisterFlagValidator(&FLAGS_##name, validator)
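-
-// Example use of the macro (illustrative sketch, reusing ValidatePort from
-// the example above):
-//   DEFINE_int32(port, 0, "What port to listen on");
-//   DEFINE_validator(port, &ValidatePort);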
-
-
-// --------------------------------------------------------------------
-// These methods are the best way to get access to info about the
-// list of commandline flags. Note that these routines are pretty slow.
-// GetAllFlags: mostly-complete info about the list, sorted by file.
-// ShowUsageWithFlags: pretty-prints the list to stdout (what --help does)
-// ShowUsageWithFlagsRestrict: limit to filenames with restrict as a substr
-//
-// In addition to accessing flags, you can also access argv[0] (the program
-// name) and argv (the entire commandline), which we sock away a copy of.
-// These variables are static, so you should only set them once.
-//
-// No need to export this data-only structure from the DLL, avoiding VS warning 4251.
-struct CommandLineFlagInfo {
- std::string name; // the name of the flag
- std::string type; // the type of the flag: int32, etc
- std::string description; // the "help text" associated with the flag
- std::string current_value; // the current value, as a string
- std::string default_value; // the default value, as a string
- std::string filename; // 'cleaned' version of filename holding the flag
- bool has_validator_fn; // true if RegisterFlagValidator called on this flag
- bool is_default; // true if the flag has the default value and
- // has not been set explicitly from the cmdline
- // or via SetCommandLineOption
- const void* flag_ptr; // pointer to the flag's current value (i.e. FLAGS_foo)
-};
-
-// Using this inside of a validator is a recipe for a deadlock.
-// TODO(user) Fix locking when validators are running, to make it safe to
-// call validators during ParseAllFlags.
-// Also make sure then to uncomment the corresponding unit test in
-// gflags_unittest.sh
-extern GFLAGS_DLL_DECL void GetAllFlags(std::vector<CommandLineFlagInfo>* OUTPUT);
-// These two are actually defined in gflags_reporting.cc.
-extern GFLAGS_DLL_DECL void ShowUsageWithFlags(const char *argv0); // what --help does
-extern GFLAGS_DLL_DECL void ShowUsageWithFlagsRestrict(const char *argv0, const char *restrict);
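-
-// Example use of GetAllFlags (illustrative sketch, not from the original
-// header):
-//   std::vector<CommandLineFlagInfo> flags;
-//   GetAllFlags(&flags);
-//   for (size_t i = 0; i < flags.size(); ++i)
-//     printf("--%s (%s)\n", flags[i].name.c_str(), flags[i].type.c_str());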
-
-// Create a descriptive string for a flag.
-// Goes to some trouble to make pretty line breaks.
-extern GFLAGS_DLL_DECL std::string DescribeOneFlag(const CommandLineFlagInfo& flag);
-
-// Thread-hostile; meant to be called before any threads are spawned.
-extern GFLAGS_DLL_DECL void SetArgv(int argc, const char** argv);
-
-// The following functions are thread-safe as long as SetArgv() is
-// only called before any threads start.
-extern GFLAGS_DLL_DECL const std::vector<std::string>& GetArgvs();
-extern GFLAGS_DLL_DECL const char* GetArgv(); // all of argv as a string
-extern GFLAGS_DLL_DECL const char* GetArgv0(); // only argv0
-extern GFLAGS_DLL_DECL uint32 GetArgvSum(); // simple checksum of argv
-extern GFLAGS_DLL_DECL const char* ProgramInvocationName(); // argv0, or "UNKNOWN" if not set
-extern GFLAGS_DLL_DECL const char* ProgramInvocationShortName(); // basename(argv0)
-
-// ProgramUsage() is thread-safe as long as SetUsageMessage() is only
-// called before any threads start.
-extern GFLAGS_DLL_DECL const char* ProgramUsage(); // string set by SetUsageMessage()
-
-// VersionString() is thread-safe as long as SetVersionString() is only
-// called before any threads start.
-extern GFLAGS_DLL_DECL const char* VersionString(); // string set by SetVersionString()
-
-
-
-// --------------------------------------------------------------------
-// Normally you access commandline flags by just saying "if (FLAGS_foo)"
-// or whatever, and set them by calling "FLAGS_foo = bar" (or, more
-// commonly, via the DEFINE_foo macro). But if you need a bit more
-// control, we have programmatic ways to get/set the flags as well.
-// These programmatic ways to access flags are thread-safe, but direct
-// access is only thread-compatible.
-
-// Return true iff the flagname was found.
-// OUTPUT is set to the flag's value, or unchanged if we return false.
-extern GFLAGS_DLL_DECL bool GetCommandLineOption(const char* name, std::string* OUTPUT);
-
-// Return true iff the flagname was found. OUTPUT is set to the flag's
-// CommandLineFlagInfo or unchanged if we return false.
-extern GFLAGS_DLL_DECL bool GetCommandLineFlagInfo(const char* name, CommandLineFlagInfo* OUTPUT);
-
-// Return the CommandLineFlagInfo of the flagname. exit() if name not found.
-// Example usage, to check if a flag's value is currently the default value:
-// if (GetCommandLineFlagInfoOrDie("foo").is_default) ...
-extern GFLAGS_DLL_DECL CommandLineFlagInfo GetCommandLineFlagInfoOrDie(const char* name);
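-
-// Example use (illustrative sketch; "foo" is a hypothetical flag):
-//   std::string value;
-//   if (GetCommandLineOption("foo", &value)) {
-//     // value now holds the current value of --foo as a string.
-//   }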
-
-enum GFLAGS_DLL_DECL FlagSettingMode {
- // update the flag's value (can call this multiple times).
- SET_FLAGS_VALUE,
- // update the flag's value, but *only if* it has not yet been updated
- // with SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef".
- SET_FLAG_IF_DEFAULT,
- // set the flag's default value to this. If the flag has not yet been
- // updated (via SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef"),
- // change the flag's current value to the new default value as well.
- SET_FLAGS_DEFAULT
-};
-
-// Set a particular flag ("command line option"). Returns a string
-// describing the new value that the option has been set to. The
-// return value API is not well-specified, so basically just depend on
-// it to be empty if the setting failed for some reason -- the name is
-// not a valid flag name, or the value is not a valid value -- and
-// non-empty otherwise.
-
-// SetCommandLineOption uses set_mode == SET_FLAGS_VALUE (the common case)
-extern GFLAGS_DLL_DECL std::string SetCommandLineOption (const char* name, const char* value);
-extern GFLAGS_DLL_DECL std::string SetCommandLineOptionWithMode(const char* name, const char* value, FlagSettingMode set_mode);
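-
-// Example use (illustrative sketch; "foo" is a hypothetical flag):
-//   std::string result = SetCommandLineOption("foo", "true");
-//   if (result.empty()) {
-//     // Setting failed: no such flag, or "true" is not a valid value.
-//   }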
-
-
-// --------------------------------------------------------------------
-// Saves the states (value, default value, whether the user has set
-// the flag, registered validators, etc) of all flags, and restores
-// them when the FlagSaver is destroyed. This is very useful in
-// tests, say, when you want to let your tests change the flags, but
-// make sure that they get reverted to the original states when your
-// test is complete.
-//
-// Example usage:
-// void TestFoo() {
-// FlagSaver s1;
-// FLAG_foo = false;
-// FLAG_bar = "some value";
-//
-// // test happens here. You can return at any time
-// // without worrying about restoring the FLAG values.
-// }
-//
-// Note: This class is marked with GFLAGS_ATTRIBUTE_UNUSED because all
-// the work is done in the constructor and destructor, so in the standard
-// usage example above, the compiler would complain that it's an
-// unused variable.
-//
-// This class is thread-safe. However, its destructor writes to
-// exactly the set of flags that have changed value during its
-// lifetime, so concurrent _direct_ access to those flags
-// (i.e. FLAGS_foo instead of {Get,Set}CommandLineOption()) is unsafe.
-
-class GFLAGS_DLL_DECL FlagSaver {
- public:
- FlagSaver();
- ~FlagSaver();
-
- private:
- class FlagSaverImpl* impl_; // we use pimpl here to keep API steady
-
- FlagSaver(const FlagSaver&); // no copying!
- void operator=(const FlagSaver&);
-}__attribute((unused));
-
-// --------------------------------------------------------------------
-// Some deprecated or hopefully-soon-to-be-deprecated functions.
-
-// This is often used for logging. TODO(csilvers): figure out a better way
-extern GFLAGS_DLL_DECL std::string CommandlineFlagsIntoString();
-// Usually where this is used, a FlagSaver should be used instead.
-extern GFLAGS_DLL_DECL
-bool ReadFlagsFromString(const std::string& flagfilecontents,
- const char* prog_name,
- bool errors_are_fatal); // uses SET_FLAGS_VALUE
-
-// These let you manually implement --flagfile functionality.
-// DEPRECATED.
-extern GFLAGS_DLL_DECL bool AppendFlagsIntoFile(const std::string& filename, const char* prog_name);
-extern GFLAGS_DLL_DECL bool ReadFromFlagsFile(const std::string& filename, const char* prog_name, bool errors_are_fatal); // uses SET_FLAGS_VALUE
-
-
-// --------------------------------------------------------------------
-// Useful routines for initializing flags from the environment.
-// In each case, if 'varname' does not exist in the environment
-// return defval. If 'varname' does exist but is not valid
-// (e.g., not a number for an int32 flag), abort with an error.
-// Otherwise, return the value. NOTE: for booleans, for true use
-// 't' or 'T' or 'true' or '1', for false 'f' or 'F' or 'false' or '0'.
-
-extern GFLAGS_DLL_DECL bool BoolFromEnv(const char *varname, bool defval);
-extern GFLAGS_DLL_DECL int32 Int32FromEnv(const char *varname, int32 defval);
-extern GFLAGS_DLL_DECL int64 Int64FromEnv(const char *varname, int64 defval);
-extern GFLAGS_DLL_DECL uint64 Uint64FromEnv(const char *varname, uint64 defval);
-extern GFLAGS_DLL_DECL double DoubleFromEnv(const char *varname, double defval);
-extern GFLAGS_DLL_DECL const char *StringFromEnv(const char *varname, const char *defval);
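-
-// Example use (illustrative sketch; the environment variable name is
-// hypothetical): let the environment supply a flag's default value.
-//   DEFINE_bool(verbose, BoolFromEnv("MYAPP_VERBOSE", false),
-//               "Enable verbose output");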
-
-
-// --------------------------------------------------------------------
-// The next two functions parse gflags from main():
-
-// Set the "usage" message for this program. For example:
-// string usage("This program does nothing. Sample usage:\n");
-// usage += argv[0] + " <uselessarg1> <uselessarg2>";
-// SetUsageMessage(usage);
-// Do not include commandline flags in the usage: we do that for you!
-// Thread-hostile; meant to be called before any threads are spawned.
-extern GFLAGS_DLL_DECL void SetUsageMessage(const std::string& usage);
-
-// Sets the version string, which is emitted with --version.
-// For instance: SetVersionString("1.3");
-// Thread-hostile; meant to be called before any threads are spawned.
-extern GFLAGS_DLL_DECL void SetVersionString(const std::string& version);
-
-
-// Looks for flags in argv and parses them. Rearranges argv to put
-// flags first, or removes them entirely if remove_flags is true.
-// If a flag is defined more than once in the command line or flag
-// file, the last definition is used. Returns the index (into argv)
-// of the first non-flag argument.
-// See top-of-file for more details on this function.
-#ifndef SWIG // In swig, use ParseCommandLineFlagsScript() instead.
-extern GFLAGS_DLL_DECL uint32 ParseCommandLineFlags(int *argc, char*** argv, bool remove_flags);
-#endif
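-
-// Typical use from main() (illustrative sketch; the usage and version
-// strings are examples):
-//   int main(int argc, char** argv) {
-//     GFLAGS_NAMESPACE::SetUsageMessage("Usage: see --help");
-//     GFLAGS_NAMESPACE::SetVersionString("1.0");
-//     GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
-//     // Only non-flag arguments remain in argv at this point.
-//   }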
-
-
-// Calls to ParseCommandLineNonHelpFlags and then to
-// HandleCommandLineHelpFlags can be used instead of a call to
-// ParseCommandLineFlags during initialization, in order to allow for
-// changing default values for some FLAGS (via
-// e.g. SetCommandLineOptionWithMode calls) between the time of
-// command line parsing and the time of dumping help information for
-// the flags as a result of command line parsing. If a flag is
-// defined more than once in the command line or flag file, the last
-// definition is used. Returns the index (into argv) of the first
-// non-flag argument. (If remove_flags is true, will always return 1.)
-extern GFLAGS_DLL_DECL uint32 ParseCommandLineNonHelpFlags(int *argc, char*** argv, bool remove_flags);
-
-// This is actually defined in gflags_reporting.cc.
-// This function is misnamed (it also handles --version, etc.), but
-// it's too late to change that now. :-(
-extern GFLAGS_DLL_DECL void HandleCommandLineHelpFlags(); // in gflags_reporting.cc
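-
-// Example of the two-phase sequence (illustrative sketch; the flag name and
-// value are hypothetical):
-//   ParseCommandLineNonHelpFlags(&argc, &argv, true);
-//   SetCommandLineOptionWithMode("port", "8080", SET_FLAGS_DEFAULT);
-//   HandleCommandLineHelpFlags();  // --help now reports the new default.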
-
-// Allow command line reparsing. Disables the error normally
-// generated when an unknown flag is found, since it may be found in a
-// later parse. Thread-hostile; meant to be called before any threads
-// are spawned.
-extern GFLAGS_DLL_DECL void AllowCommandLineReparsing();
-
-// Reparse the flags that have not yet been recognized. Only flags
-// registered since the last parse will be recognized. Any flag value
-// must be provided as part of the argument using "=", not as a
-// separate command line argument that follows the flag argument.
-// Intended for handling flags from dynamically loaded libraries,
-// since their flags are not registered until they are loaded.
-extern GFLAGS_DLL_DECL void ReparseCommandLineNonHelpFlags();
-
-// Clean up memory allocated by flags. This is only needed to reduce
-// the quantity of "potentially leaked" reports emitted by memory
-// debugging tools such as valgrind. It is not required for normal
-// operation, or for the google perftools heap-checker. It must only
-// be called when the process is about to exit, and all threads that
-// might access flags are quiescent. Referencing flags after this is
-// called will have unexpected consequences. This is not safe to run
-// when multiple threads might be running: the function is
-// thread-hostile.
-extern GFLAGS_DLL_DECL void ShutDownCommandLineFlags();
-
-
-// --------------------------------------------------------------------
-// Now come the command line flag declaration/definition macros that
-// will actually be used. They're kind of hairy. A major reason
-// for this is initialization: we want people to be able to access
-// variables in global constructors and have that not crash, even if
-// their global constructor runs before the global constructor here.
-// (Obviously, we can't guarantee the flags will have the correct
-// default value in that case, but at least accessing them is safe.)
-// The only way to do that is have flags point to a static buffer.
-// So we make one, using a union to ensure proper alignment, and
-// then use placement-new to actually set up the flag with the
-// correct default value. In the same vein, we have to worry about
-// flag access in global destructors, so FlagRegisterer has to be
-// careful never to destroy the flag-values it constructs.
-//
-// Note that when we define a flag variable FLAGS_<name>, we also
-// preemptively define a junk variable, FLAGS_no<name>. This is to
-// cause a link-time error if someone tries to define 2 flags with
-// names like "logging" and "nologging". We do this because a bool
-// flag FLAG can be set from the command line to true with a "-FLAG"
-// argument, and to false with a "-noFLAG" argument, and so this can
-// potentially avert confusion.
-//
-// We also put flags into their own namespace. It is purposefully
-// named in an opaque way that people should have trouble typing
-// directly. The idea is that DEFINE puts the flag in the weird
-// namespace, and DECLARE imports the flag from there into the current
-// namespace. The net result is to force people to use DECLARE to get
-// access to a flag, rather than saying "extern GFLAGS_DLL_DECL bool FLAGS_whatever;"
-// or some such instead. We want this so we can put extra
-// functionality (like sanity-checking) in DECLARE if we want, and
-// make sure it is picked up everywhere.
-//
-// We also put the type of the variable in the namespace, so that
-// people can't DECLARE_int32 something that they DEFINE_bool'd
-// elsewhere.
-
-class GFLAGS_DLL_DECL FlagRegisterer {
- public:
- FlagRegisterer(const char* name, const char* type,
- const char* help, const char* filename,
- void* current_storage, void* defvalue_storage);
-};
-
-// If your application #defines STRIP_FLAG_HELP to a non-zero value
-// before #including this file, we remove the help message from the
-// binary file. This can reduce the size of the resulting binary
-// somewhat, and may also be useful for security reasons.
-
-extern GFLAGS_DLL_DECL const char kStrippedFlagHelp[];
-
-
-} // namespace GFLAGS_NAMESPACE
-
-
-#ifndef SWIG // In swig, ignore the main flag declarations
-
-#if defined(STRIP_FLAG_HELP) && STRIP_FLAG_HELP > 0
-// Need this construct to avoid the 'defined but not used' warning.
-#define MAYBE_STRIPPED_HELP(txt) \
- (false ? (txt) : GFLAGS_NAMESPACE::kStrippedFlagHelp)
-#else
-#define MAYBE_STRIPPED_HELP(txt) txt
-#endif
-
-// Each command-line flag has two variables associated with it: one
-// with the current value, and one with the default value. However,
-// we have a third variable, which is where value is assigned; it's a
-// constant. This guarantees that FLAG_##value is initialized at
-// static initialization time (e.g. before program-start) rather than
-// at global construction time (which is after program-start but
-// before main), at least when 'value' is a compile-time constant. We
-// use a small trick for the "default value" variable, and call it
-// FLAGS_no<name>. This serves the second purpose of assuring a
-// compile error if someone tries to define a flag named no<name>
-// which is illegal (--foo and --nofoo both affect the "foo" flag).
-#define DEFINE_VARIABLE(type, shorttype, name, value, help) \
- namespace fL##shorttype { \
- static const type FLAGS_nono##name = value; \
- /* We always want to export defined variables, dll or no */ \
- GFLAGS_DLL_DEFINE_FLAG type FLAGS_##name = FLAGS_nono##name; \
- type FLAGS_no##name = FLAGS_nono##name; \
- static GFLAGS_NAMESPACE::FlagRegisterer o_##name( \
- #name, #type, MAYBE_STRIPPED_HELP(help), __FILE__, \
- &FLAGS_##name, &FLAGS_no##name); \
- } \
- using fL##shorttype::FLAGS_##name
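-
-// For example (illustrative; "timeout" is a hypothetical flag),
-// DEFINE_int32(timeout, 10, "help text") defines fLI::FLAGS_timeout (the
-// current value), fLI::FLAGS_notimeout (the saved default), and a
-// FlagRegisterer that records the flag; the trailing using-declaration
-// then makes FLAGS_timeout visible in the enclosing scope.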
-
-// For DEFINE_bool, we want to do the extra check that the passed-in
-// value is actually a bool, and not a string or something that can be
-// coerced to a bool. These declarations (no definition needed!) will
-// help us do that, and never evaluate From, which is important.
-// We'll use 'sizeof(IsBool(val))' to distinguish. This code requires
-// that the compiler have different sizes for bool & double. Since
-// this is not guaranteed by the standard, we check it with a
-// COMPILE_ASSERT.
-namespace fLB {
-struct CompileAssert {};
-typedef CompileAssert expected_sizeof_double_neq_sizeof_bool[
- (sizeof(double) != sizeof(bool)) ? 1 : -1];
-template<typename From> double GFLAGS_DLL_DECL IsBoolFlag(const From& from);
-GFLAGS_DLL_DECL bool IsBoolFlag(bool from);
-} // namespace fLB
-
-// Here are the actual DEFINE_*-macros. The respective DECLARE_*-macros
-// are in a separate include, gflags_declare.h, for reducing
-// the physical transitive size for DECLARE use.
-#define DEFINE_bool(name, val, txt) \
- namespace fLB { \
- typedef ::fLB::CompileAssert FLAG_##name##_value_is_not_a_bool[ \
- (sizeof(::fLB::IsBoolFlag(val)) != sizeof(double))? 1: -1]; \
- } \
- DEFINE_VARIABLE(bool, B, name, val, txt)
-
-#define DEFINE_int32(name, val, txt) \
- DEFINE_VARIABLE(GFLAGS_NAMESPACE::int32, I, \
- name, val, txt)
-
-#define DEFINE_int64(name, val, txt) \
- DEFINE_VARIABLE(GFLAGS_NAMESPACE::int64, I64, \
- name, val, txt)
-
-#define DEFINE_uint64(name,val, txt) \
- DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint64, U64, \
- name, val, txt)
-
-#define DEFINE_double(name, val, txt) \
- DEFINE_VARIABLE(double, D, name, val, txt)
-
-// Strings are trickier, because they're not a POD, so we can't
-// construct them at static-initialization time (instead they get
-// constructed at global-constructor time, which is much later). To
-// try to avoid crashes in that case, we use a char buffer to store
-// the string, which we can static-initialize, and then placement-new
-// into it later. It's not perfect, but the best we can do.
-
-namespace fLS {
-
-inline clstring* dont_pass0toDEFINE_string(char *stringspot,
- const char *value) {
- return new(stringspot) clstring(value);
-}
-inline clstring* dont_pass0toDEFINE_string(char *stringspot,
- const clstring &value) {
- return new(stringspot) clstring(value);
-}
-inline clstring* dont_pass0toDEFINE_string(char *stringspot,
- int value);
-} // namespace fLS
-
-// We need to define a var named FLAGS_no##name so people don't define
-// --string and --nostring. And we need a temporary place to put val
-// so we don't have to evaluate it twice. Two great needs that go
-// great together!
-// The weird 'using' + 'extern' inside the fLS namespace is to work around
-// an unknown compiler bug/issue with the gcc 4.2.1 on SUSE 10. See
-// http://code.google.com/p/google-gflags/issues/detail?id=20
-#define DEFINE_string(name, val, txt) \
- namespace fLS { \
- using ::fLS::clstring; \
- static union { void* align; char s[sizeof(clstring)]; } s_##name[2]; \
- clstring* const FLAGS_no##name = ::fLS:: \
- dont_pass0toDEFINE_string(s_##name[0].s, \
- val); \
- static GFLAGS_NAMESPACE::FlagRegisterer o_##name( \
- #name, "string", MAYBE_STRIPPED_HELP(txt), __FILE__, \
- s_##name[0].s, new (s_##name[1].s) clstring(*FLAGS_no##name)); \
- extern GFLAGS_DLL_DEFINE_FLAG clstring& FLAGS_##name; \
- using fLS::FLAGS_##name; \
- clstring& FLAGS_##name = *FLAGS_no##name; \
- } \
- using fLS::FLAGS_##name
-
-#endif // SWIG
-
-
-// Import gflags library symbols into alternative/deprecated namespace(s)
-#include "gflags_gflags.h"
-
-
-#endif // GFLAGS_GFLAGS_H_
diff --git a/files/third_party/gflags/gen/posix/include/gflags/gflags_completions.h b/files/third_party/gflags/gen/posix/include/gflags/gflags_completions.h
deleted file mode 100644
index f951c1e0..00000000
--- a/files/third_party/gflags/gen/posix/include/gflags/gflags_completions.h
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright (c) 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ---
-
-//
-// Implement helpful bash-style command line flag completions
-//
-// ** Functional API:
-// HandleCommandLineCompletions() should be called early during
-// program startup, but after command line flag code has been
-// initialized, such as the beginning of HandleCommandLineHelpFlags().
-// It checks the value of the flag --tab_completion_word. If this
-// flag is empty, nothing happens here. If it contains a string,
-// however, then HandleCommandLineCompletions() will hijack the
-// process, attempting to identify the intention behind this
-// completion. Regardless of the outcome of this deduction, the
-// process will be terminated, similar to --helpshort flag
-// handling.
-//
-// ** Overview of Bash completions:
-// Bash can be told to programmatically determine completions for the
-// current 'cursor word'. It does this by (in this case) invoking a
-// command with some additional arguments identifying the command
-// being executed, the word being completed, and the previous word
-// (if any). Bash then expects a sequence of output lines to be
-// printed to stdout. If these lines all contain a common prefix
-// longer than the cursor word, bash will replace the cursor word
-// with that common prefix, and display nothing. If there isn't such
-// a common prefix, bash will display the lines in pages using 'more'.
-//
-// ** Strategy taken for command line completions:
-// If we can deduce either the exact flag intended, or a common flag
-// prefix, we'll output exactly that. Otherwise, if information
-// must be displayed to the user, we'll take the opportunity to add
-// some helpful information beyond just the flag name (specifically,
-// we'll include the default flag value and as much of the flag's
-// description as can fit on a single terminal line width, as specified
-// by the flag --tab_completion_columns). Furthermore, we'll try to
-// make bash order the output such that the most useful or relevant
-// flags are the most likely to be shown at the top.
-//
-// ** Additional features:
-// To assist in finding that one really useful flag, substring matching
-// was implemented. Before pressing a <TAB> to get completion for the
-// current word, you can append one or more '?' to the flag to do
-// substring matching. Here's the semantics:
-// --foo<TAB> Show me all flags with names prefixed by 'foo'
-// --foo?<TAB> Show me all flags with 'foo' somewhere in the name
-// --foo??<TAB> Same as prior case, but also search in module
-// definition path for 'foo'
-// --foo???<TAB> Same as prior case, but also search in flag
-// descriptions for 'foo'
-// Finally, we'll trim the output to a relatively small number of
-// flags to keep bash quiet about the verbosity of output. If one
-// really wanted to see all possible matches, appending a '+' to the
-// search word will force the exhaustive list of matches to be printed.
-//
-// ** How to have bash accept completions from a binary:
-// Bash requires that it be informed about each command that programmatic
-// completion should be enabled for. Example addition to a .bashrc
-// file would be (your path to gflags_completions.sh file may differ):
-
-/*
-$ complete -o bashdefault -o default -o nospace -C \
- '/home/build/eng/bash/bash_completions.sh --tab_completion_columns $COLUMNS' \
- time env binary_name another_binary [...]
-*/
-
-// This would allow the following to work:
-// $ /path/to/binary_name --vmodule<TAB>
-// Or:
-// $ ./bin/path/another_binary --gfs_u<TAB>
-// (etc)
-//
-// Sadly, it appears that bash gives no easy way to force this behavior for
-// all commands. That's where the "time" in the above example comes in.
-// If you haven't specifically added a command to the list of completion
-// supported commands, you can still get completions by prefixing the
-// entire command with "env".
-// $ env /some/brand/new/binary --vmod<TAB>
-// Assuming that "binary" is a newly compiled binary, this should still
-// produce the expected completion output.
-
-
-#ifndef GFLAGS_COMPLETIONS_H_
-#define GFLAGS_COMPLETIONS_H_
-
-namespace google {
-
-extern void HandleCommandLineCompletions(void);
-
-}
-
-#endif // GFLAGS_COMPLETIONS_H_
diff --git a/files/third_party/gflags/gen/posix/include/gflags/gflags_declare.h b/files/third_party/gflags/gen/posix/include/gflags/gflags_declare.h
deleted file mode 100644
index 935a20e7..00000000
--- a/files/third_party/gflags/gen/posix/include/gflags/gflags_declare.h
+++ /dev/null
@@ -1,141 +0,0 @@
-// Copyright (c) 1999, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// ---
-//
-// Revamped and reorganized by Craig Silverstein
-//
-// This is the file that should be included by any file which declares
-// a command line flag.
-
-#ifndef GFLAGS_DECLARE_H_
-#define GFLAGS_DECLARE_H_
-
-
-// ---------------------------------------------------------------------------
-// Namespace of gflags library symbols.
-#define GFLAGS_NAMESPACE google
-
-// ---------------------------------------------------------------------------
-// Windows DLL import/export.
-
-// We always want to import the symbols of the gflags library
-#ifndef GFLAGS_DLL_DECL
-# if 0 && defined(_MSC_VER)
-# define GFLAGS_DLL_DECL __declspec(dllimport)
-# else
-# define GFLAGS_DLL_DECL
-# endif
-#endif
-
-// We always want to import variables declared in user code
-#ifndef GFLAGS_DLL_DECLARE_FLAG
-# ifdef _MSC_VER
-# define GFLAGS_DLL_DECLARE_FLAG __declspec(dllimport)
-# else
-# define GFLAGS_DLL_DECLARE_FLAG
-# endif
-#endif
-
-// ---------------------------------------------------------------------------
-// Flag types
-#include <string>
-#if 1
-# include <stdint.h> // the normal place uint32_t is defined
-#elif 1
-# include <sys/types.h> // the normal place u_int32_t is defined
-#elif 1
-# include <inttypes.h> // a third place for uint32_t or u_int32_t
-#endif
-
-namespace GFLAGS_NAMESPACE {
-
-#if 1 // C99
-typedef int32_t int32;
-typedef uint32_t uint32;
-typedef int64_t int64;
-typedef uint64_t uint64;
-#elif 0 // BSD
-typedef int32_t int32;
-typedef u_int32_t uint32;
-typedef int64_t int64;
-typedef u_int64_t uint64;
-#elif 0 // Windows
-typedef __int32 int32;
-typedef unsigned __int32 uint32;
-typedef __int64 int64;
-typedef unsigned __int64 uint64;
-#else
-# error Do not know how to define a 32-bit integer quantity on your system
-#endif
-
-} // namespace GFLAGS_NAMESPACE
-
-
-namespace fLS {
-
-// The meaning of "string" might be different between now and when the
-// macros below get invoked (e.g., if someone is experimenting with
-// other string implementations that get defined after this file is
-// included). Save the current meaning now and use it in the macros.
-typedef std::string clstring;
-
-} // namespace fLS
-
-
-#define DECLARE_VARIABLE(type, shorttype, name) \
- /* We always want to import declared variables, dll or no */ \
- namespace fL##shorttype { extern GFLAGS_DLL_DECLARE_FLAG type FLAGS_##name; } \
- using fL##shorttype::FLAGS_##name
-
-#define DECLARE_bool(name) \
- DECLARE_VARIABLE(bool, B, name)
-
-#define DECLARE_int32(name) \
- DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int32, I, name)
-
-#define DECLARE_int64(name) \
- DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int64, I64, name)
-
-#define DECLARE_uint64(name) \
- DECLARE_VARIABLE(::GFLAGS_NAMESPACE::uint64, U64, name)
-
-#define DECLARE_double(name) \
- DECLARE_VARIABLE(double, D, name)
-
-#define DECLARE_string(name) \
- /* We always want to import declared variables, dll or no */ \
- namespace fLS { \
- using ::fLS::clstring; \
- extern GFLAGS_DLL_DECLARE_FLAG ::fLS::clstring& FLAGS_##name; \
- } \
- using fLS::FLAGS_##name
-
-
-#endif // GFLAGS_DECLARE_H_
diff --git a/files/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h b/files/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h
deleted file mode 100644
index 0c17825d..00000000
--- a/files/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright (c) 2014, Andreas Schuh
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// -----------------------------------------------------------------------------
-// Imports the gflags library symbols into an alternative/deprecated namespace.
-
-#ifndef GFLAGS_GFLAGS_H_
-# error The internal header gflags_gflags.h may only be included by gflags.h
-#endif
-
-#ifndef GFLAGS_NS_GFLAGS_H_
-#define GFLAGS_NS_GFLAGS_H_
-
-
-namespace gflags {
-
-
-using GFLAGS_NAMESPACE::int32;
-using GFLAGS_NAMESPACE::uint32;
-using GFLAGS_NAMESPACE::int64;
-using GFLAGS_NAMESPACE::uint64;
-
-using GFLAGS_NAMESPACE::RegisterFlagValidator;
-using GFLAGS_NAMESPACE::CommandLineFlagInfo;
-using GFLAGS_NAMESPACE::GetAllFlags;
-using GFLAGS_NAMESPACE::ShowUsageWithFlags;
-using GFLAGS_NAMESPACE::ShowUsageWithFlagsRestrict;
-using GFLAGS_NAMESPACE::DescribeOneFlag;
-using GFLAGS_NAMESPACE::SetArgv;
-using GFLAGS_NAMESPACE::GetArgvs;
-using GFLAGS_NAMESPACE::GetArgv;
-using GFLAGS_NAMESPACE::GetArgv0;
-using GFLAGS_NAMESPACE::GetArgvSum;
-using GFLAGS_NAMESPACE::ProgramInvocationName;
-using GFLAGS_NAMESPACE::ProgramInvocationShortName;
-using GFLAGS_NAMESPACE::ProgramUsage;
-using GFLAGS_NAMESPACE::VersionString;
-using GFLAGS_NAMESPACE::GetCommandLineOption;
-using GFLAGS_NAMESPACE::GetCommandLineFlagInfo;
-using GFLAGS_NAMESPACE::GetCommandLineFlagInfoOrDie;
-using GFLAGS_NAMESPACE::FlagSettingMode;
-using GFLAGS_NAMESPACE::SET_FLAGS_VALUE;
-using GFLAGS_NAMESPACE::SET_FLAG_IF_DEFAULT;
-using GFLAGS_NAMESPACE::SET_FLAGS_DEFAULT;
-using GFLAGS_NAMESPACE::SetCommandLineOption;
-using GFLAGS_NAMESPACE::SetCommandLineOptionWithMode;
-using GFLAGS_NAMESPACE::FlagSaver;
-using GFLAGS_NAMESPACE::CommandlineFlagsIntoString;
-using GFLAGS_NAMESPACE::ReadFlagsFromString;
-using GFLAGS_NAMESPACE::AppendFlagsIntoFile;
-using GFLAGS_NAMESPACE::ReadFromFlagsFile;
-using GFLAGS_NAMESPACE::BoolFromEnv;
-using GFLAGS_NAMESPACE::Int32FromEnv;
-using GFLAGS_NAMESPACE::Int64FromEnv;
-using GFLAGS_NAMESPACE::Uint64FromEnv;
-using GFLAGS_NAMESPACE::DoubleFromEnv;
-using GFLAGS_NAMESPACE::StringFromEnv;
-using GFLAGS_NAMESPACE::SetUsageMessage;
-using GFLAGS_NAMESPACE::SetVersionString;
-using GFLAGS_NAMESPACE::ParseCommandLineNonHelpFlags;
-using GFLAGS_NAMESPACE::HandleCommandLineHelpFlags;
-using GFLAGS_NAMESPACE::AllowCommandLineReparsing;
-using GFLAGS_NAMESPACE::ReparseCommandLineNonHelpFlags;
-using GFLAGS_NAMESPACE::ShutDownCommandLineFlags;
-using GFLAGS_NAMESPACE::FlagRegisterer;
-
-#ifndef SWIG
-using GFLAGS_NAMESPACE::ParseCommandLineFlags;
-#endif
-
-
-} // namespace gflags
-
-
-#endif // GFLAGS_NS_GFLAGS_H_
diff --git a/files/third_party/gflags/gen/posix/include/private/config.h b/files/third_party/gflags/gen/posix/include/private/config.h
deleted file mode 100644
index 592d61c4..00000000
--- a/files/third_party/gflags/gen/posix/include/private/config.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Generated from config.h.in during build configuration using CMake. */
-
-// Note: This header file is only used internally. It is not part of the public interface!
-
-// ---------------------------------------------------------------------------
-// System checks
-
-// Define if you build this library for a MS Windows OS.
-/* #undef OS_WINDOWS */
-
-// Define if you have the <stdint.h> header file.
-#define HAVE_STDINT_H
-
-// Define if you have the <sys/types.h> header file.
-#define HAVE_SYS_TYPES_H
-
-// Define if you have the <inttypes.h> header file.
-#define HAVE_INTTYPES_H
-
-// Define if you have the <sys/stat.h> header file.
-#define HAVE_SYS_STAT_H
-
-// Define if you have the <unistd.h> header file.
-#define HAVE_UNISTD_H
-
-// Define if you have the <fnmatch.h> header file.
-#define HAVE_FNMATCH_H
-
-// Define if you have the <shlwapi.h> header file (Windows 2000/XP).
-/* #undef HAVE_SHLWAPI_H */
-
-// Define if you have the strtoll function.
-#define HAVE_STRTOLL
-
-// Define if you have the strtoq function.
-/* #undef HAVE_STRTOQ */
-
-// Define if you have the <pthread.h> header file.
-#define HAVE_PTHREAD
-
-// Define if your pthread library defines the type pthread_rwlock_t
-#define HAVE_RWLOCK
-
-// gcc requires this to get PRId64, etc.
-#if defined(HAVE_INTTYPES_H) && !defined(__STDC_FORMAT_MACROS)
-# define __STDC_FORMAT_MACROS 1
-#endif
-
-// ---------------------------------------------------------------------------
-// Package information
-
-// Name of package.
-#define PACKAGE gflags
-
-// Define to the full name of this package.
-#define PACKAGE_NAME gflags
-
-// Define to the full name and version of this package.
-#define PACKAGE_STRING gflags 2.2.0
-
-// Define to the one symbol short name of this package.
-#define PACKAGE_TARNAME gflags-2.2.0
-
-// Define to the version of this package.
-#define PACKAGE_VERSION 2.2.0
-
-// Version number of package.
-#define VERSION PACKAGE_VERSION
-
-// Define to the address where bug reports for this package should be sent.
-#define PACKAGE_BUGREPORT https://github.com/schuhschuh/gflags/issues
-
-// ---------------------------------------------------------------------------
-// Path separator
-#ifndef PATH_SEPARATOR
-# ifdef OS_WINDOWS
-# define PATH_SEPARATOR '\\'
-# else
-# define PATH_SEPARATOR '/'
-# endif
-#endif
-
-// ---------------------------------------------------------------------------
-// Windows
-
-// Whether gflags library is a DLL.
-#ifndef GFLAGS_IS_A_DLL
-# define GFLAGS_IS_A_DLL 0
-#endif
-
-// Always export symbols when compiling a shared library as this file is only
-// included by internal modules when building the gflags library itself.
-// The gflags_declare.h header file will set it to import these symbols otherwise.
-#ifndef GFLAGS_DLL_DECL
-# if GFLAGS_IS_A_DLL && defined(_MSC_VER)
-# define GFLAGS_DLL_DECL __declspec(dllexport)
-# else
-# define GFLAGS_DLL_DECL
-# endif
-#endif
-// Flags defined by the gflags library itself must be exported
-#ifndef GFLAGS_DLL_DEFINE_FLAG
-# define GFLAGS_DLL_DEFINE_FLAG GFLAGS_DLL_DECL
-#endif
-
-#ifdef OS_WINDOWS
-// The unittests import the symbols of the shared gflags library
-# if GFLAGS_IS_A_DLL && defined(_MSC_VER)
-# define GFLAGS_DLL_DECL_FOR_UNITTESTS __declspec(dllimport)
-# endif
-# include "windows_port.h"
-#endif
diff --git a/files/third_party/gflags/gen/win/include/gflags/gflags.h b/files/third_party/gflags/gen/win/include/gflags/gflags.h
deleted file mode 100644
index 357eec6b..00000000
--- a/files/third_party/gflags/gen/win/include/gflags/gflags.h
+++ /dev/null
@@ -1,573 +0,0 @@
-// Copyright (c) 2006, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// ---
-// Revamped and reorganized by Craig Silverstein
-//
-// This is the file that should be included by any file which declares
-// or defines a command line flag or wants to parse command line flags
-// or print a program usage message (which will include information about
-// flags). Executive summary, in the form of an example foo.cc file:
-//
-// #include "foo.h" // foo.h has a line "DECLARE_int32(start);"
-// #include "validators.h" // hypothetical file defining ValidateIsFile()
-//
-// DEFINE_int32(end, 1000, "The last record to read");
-//
-// DEFINE_string(filename, "my_file.txt", "The file to read");
-// // Crash if the specified file does not exist.
-// static bool dummy = RegisterFlagValidator(&FLAGS_filename,
-// &ValidateIsFile);
-//
-// DECLARE_bool(verbose); // some other file has a DEFINE_bool(verbose, ...)
-//
-// void MyFunc() {
-// if (FLAGS_verbose) printf("Records %d-%d\n", FLAGS_start, FLAGS_end);
-// }
-//
-// Then, at the command-line:
-// ./foo --noverbose --start=5 --end=100
-//
-// For more details, see
-// doc/gflags.html
-//
-// --- A note about thread-safety:
-//
-// We describe many functions in this file as being thread-hostile,
-// thread-compatible, or thread-safe. Here are the meanings we use:
-//
-// thread-safe: it is safe for multiple threads to call this routine
-// (or, when referring to a class, methods of this class)
-// concurrently.
-// thread-hostile: it is not safe for multiple threads to call this
-// routine (or methods of this class) concurrently. In gflags,
-// most thread-hostile routines are intended to be called early in,
-// or even before, main() -- that is, before threads are spawned.
-// thread-compatible: it is safe for multiple threads to read from
-// this variable (when applied to variables), or to call const
-// methods of this class (when applied to classes), as long as no
-// other thread is writing to the variable or calling non-const
-// methods of this class.
-
-#ifndef GFLAGS_GFLAGS_H_
-#define GFLAGS_GFLAGS_H_
-
-#include <string>
-#include <vector>
-
-#include "gflags_declare.h" // IWYU pragma: export
-
-
-// We always want to export variables defined in user code
-#ifndef GFLAGS_DLL_DEFINE_FLAG
-# ifdef _MSC_VER
-# define GFLAGS_DLL_DEFINE_FLAG __declspec(dllexport)
-# else
-# define GFLAGS_DLL_DEFINE_FLAG
-# endif
-#endif
-
-
-namespace GFLAGS_NAMESPACE {
-
-
-// --------------------------------------------------------------------
-// To actually define a flag in a file, use DEFINE_bool,
-// DEFINE_string, etc. at the bottom of this file. You may also find
-// it useful to register a validator with the flag. This ensures that
-// when the flag is parsed from the commandline, or is later set via
-// SetCommandLineOption, we call the validation function. It is _not_
-// called when you assign the value to the flag directly using the = operator.
-//
-// The validation function should return true if the flag value is valid, and
-// false otherwise. If the function returns false for the new setting of the
-// flag, the flag will retain its current value. If it returns false for the
-// default value, ParseCommandLineFlags() will die.
-//
-// This function is safe to call at global construct time (as in the
-// example below).
-//
-// Example use:
-// static bool ValidatePort(const char* flagname, int32 value) {
-// if (value > 0 && value < 32768) // value is ok
-// return true;
-// printf("Invalid value for --%s: %d\n", flagname, (int)value);
-// return false;
-// }
-// DEFINE_int32(port, 0, "What port to listen on");
-// static bool dummy = RegisterFlagValidator(&FLAGS_port, &ValidatePort);
-
-// Returns true if successfully registered, false if not (because the
-// first argument doesn't point to a command-line flag, or because a
-// validator is already registered for this flag).
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const bool* flag, bool (*validate_fn)(const char*, bool));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int32* flag, bool (*validate_fn)(const char*, int32));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int64* flag, bool (*validate_fn)(const char*, int64));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const uint64* flag, bool (*validate_fn)(const char*, uint64));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const double* flag, bool (*validate_fn)(const char*, double));
-extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const std::string* flag, bool (*validate_fn)(const char*, const std::string&));
-
-// Convenience macro for the registration of a flag validator
-#define DEFINE_validator(name, validator) \
- static const bool name##_validator_registered = \
- GFLAGS_NAMESPACE::RegisterFlagValidator(&FLAGS_##name, validator)
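-
-// For example, the registration from the validator example above can be
-// written more compactly with this macro (a sketch, reusing ValidatePort):
-//    DEFINE_int32(port, 0, "What port to listen on");
-//    DEFINE_validator(port, &ValidatePort);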
-
-
-// --------------------------------------------------------------------
-// These methods are the best way to get access to info about the
-// list of commandline flags. Note that these routines are pretty slow.
-// GetAllFlags: mostly-complete info about the list, sorted by file.
-// ShowUsageWithFlags: pretty-prints the list to stdout (what --help does)
-// ShowUsageWithFlagsRestrict: limit to filenames with restrict as a substr
-//
-// In addition to accessing flags, you can also access argv[0] (the program
-// name) and argv (the entire commandline), which we sock away a copy of.
-// These variables are static, so you should only set them once.
-//
-// No need to export this data-only structure from the DLL; this avoids VS warning 4251.
-struct CommandLineFlagInfo {
- std::string name; // the name of the flag
- std::string type; // the type of the flag: int32, etc
- std::string description; // the "help text" associated with the flag
- std::string current_value; // the current value, as a string
- std::string default_value; // the default value, as a string
- std::string filename; // 'cleaned' version of filename holding the flag
- bool has_validator_fn; // true if RegisterFlagValidator called on this flag
- bool is_default; // true if the flag has the default value and
- // has not been set explicitly from the cmdline
- // or via SetCommandLineOption
- const void* flag_ptr; // pointer to the flag's current value (i.e. FLAGS_foo)
-};
-
-// Using this inside of a validator is a recipe for a deadlock.
-// TODO(user) Fix locking when validators are running, to make it safe to
-// call validators during ParseAllFlags.
-// Also make sure then to uncomment the corresponding unit test in
-// gflags_unittest.sh
-extern GFLAGS_DLL_DECL void GetAllFlags(std::vector<CommandLineFlagInfo>* OUTPUT);
-// These two are actually defined in gflags_reporting.cc.
-extern GFLAGS_DLL_DECL void ShowUsageWithFlags(const char *argv0); // what --help does
-extern GFLAGS_DLL_DECL void ShowUsageWithFlagsRestrict(const char *argv0, const char *restrict);
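-
-// Example use of GetAllFlags (a sketch):
-//    std::vector<CommandLineFlagInfo> flags;
-//    GetAllFlags(&flags);
-//    for (size_t i = 0; i < flags.size(); ++i)
-//      printf("--%s (%s): %s\n", flags[i].name.c_str(),
-//             flags[i].type.c_str(), flags[i].description.c_str());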
-
-// Create a descriptive string for a flag.
-// Goes to some trouble to make pretty line breaks.
-extern GFLAGS_DLL_DECL std::string DescribeOneFlag(const CommandLineFlagInfo& flag);
-
-// Thread-hostile; meant to be called before any threads are spawned.
-extern GFLAGS_DLL_DECL void SetArgv(int argc, const char** argv);
-
-// The following functions are thread-safe as long as SetArgv() is
-// only called before any threads start.
-extern GFLAGS_DLL_DECL const std::vector<std::string>& GetArgvs();
-extern GFLAGS_DLL_DECL const char* GetArgv(); // all of argv as a string
-extern GFLAGS_DLL_DECL const char* GetArgv0(); // only argv0
-extern GFLAGS_DLL_DECL uint32 GetArgvSum(); // simple checksum of argv
-extern GFLAGS_DLL_DECL const char* ProgramInvocationName(); // argv0, or "UNKNOWN" if not set
-extern GFLAGS_DLL_DECL const char* ProgramInvocationShortName(); // basename(argv0)
-
-// ProgramUsage() is thread-safe as long as SetUsageMessage() is only
-// called before any threads start.
-extern GFLAGS_DLL_DECL const char* ProgramUsage(); // string set by SetUsageMessage()
-
-// VersionString() is thread-safe as long as SetVersionString() is only
-// called before any threads start.
-extern GFLAGS_DLL_DECL const char* VersionString(); // string set by SetVersionString()
-
-
-
-// --------------------------------------------------------------------
-// Normally you access commandline flags by just saying "if (FLAGS_foo)"
-// or whatever, and set them by calling "FLAGS_foo = bar" (or, more
-// commonly, via the DEFINE_foo macro). But if you need a bit more
-// control, we have programmatic ways to get/set the flags as well.
-// These programmatic ways to access flags are thread-safe, but direct
-// access is only thread-compatible.
-
-// Return true iff the flagname was found.
-// OUTPUT is set to the flag's value, or unchanged if we return false.
-extern GFLAGS_DLL_DECL bool GetCommandLineOption(const char* name, std::string* OUTPUT);
-
-// Return true iff the flagname was found. OUTPUT is set to the flag's
-// CommandLineFlagInfo or unchanged if we return false.
-extern GFLAGS_DLL_DECL bool GetCommandLineFlagInfo(const char* name, CommandLineFlagInfo* OUTPUT);
-
-// Return the CommandLineFlagInfo of the flagname. exit() if name not found.
-// Example usage, to check if a flag's value is currently the default value:
-// if (GetCommandLineFlagInfoOrDie("foo").is_default) ...
-extern GFLAGS_DLL_DECL CommandLineFlagInfo GetCommandLineFlagInfoOrDie(const char* name);
-
-enum GFLAGS_DLL_DECL FlagSettingMode {
- // update the flag's value (can call this multiple times).
- SET_FLAGS_VALUE,
- // update the flag's value, but *only if* it has not yet been updated
- // with SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef".
- SET_FLAG_IF_DEFAULT,
-  // set the flag's default value to this. If the flag has not yet been
-  // updated (via SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef"),
-  // change the flag's current value to the new default value as well.
- SET_FLAGS_DEFAULT
-};
-
-// Set a particular flag ("command line option"). Returns a string
-// describing the new value that the option has been set to. The
-// return value API is not well-specified, so basically just depend on
-// it to be empty if the setting failed for some reason -- the name is
-// not a valid flag name, or the value is not a valid value -- and
-// non-empty otherwise.
-
-// SetCommandLineOption uses set_mode == SET_FLAGS_VALUE (the common case)
-extern GFLAGS_DLL_DECL std::string SetCommandLineOption (const char* name, const char* value);
-extern GFLAGS_DLL_DECL std::string SetCommandLineOptionWithMode(const char* name, const char* value, FlagSettingMode set_mode);
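-
-// Example use (a sketch; "foo" stands for any defined flag):
-//    std::string result = SetCommandLineOption("foo", "42");
-//    if (result.empty()) { /* the name or the value was not valid */ }
-//    SetCommandLineOptionWithMode("foo", "43", SET_FLAG_IF_DEFAULT);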
-
-
-// --------------------------------------------------------------------
-// Saves the states (value, default value, whether the user has set
-// the flag, registered validators, etc) of all flags, and restores
-// them when the FlagSaver is destroyed. This is very useful in
-// tests, say, when you want to let your tests change the flags, but
-// make sure that they get reverted to the original states when your
-// test is complete.
-//
-// Example usage:
-// void TestFoo() {
-// FlagSaver s1;
-// FLAG_foo = false;
-// FLAG_bar = "some value";
-//
-// // test happens here. You can return at any time
-// // without worrying about restoring the FLAG values.
-// }
-//
-// Note: This class is marked with GFLAGS_ATTRIBUTE_UNUSED because all
-// the work is done in the constructor and destructor, so in the standard
-// usage example above, the compiler would complain that it's an
-// unused variable.
-//
-// This class is thread-safe. However, its destructor writes to
-// exactly the set of flags that have changed value during its
-// lifetime, so concurrent _direct_ access to those flags
-// (i.e. FLAGS_foo instead of {Get,Set}CommandLineOption()) is unsafe.
-
-class GFLAGS_DLL_DECL FlagSaver {
- public:
- FlagSaver();
- ~FlagSaver();
-
- private:
- class FlagSaverImpl* impl_; // we use pimpl here to keep API steady
-
- FlagSaver(const FlagSaver&); // no copying!
- void operator=(const FlagSaver&);
-};
-
-// --------------------------------------------------------------------
-// Some deprecated or hopefully-soon-to-be-deprecated functions.
-
-// This is often used for logging. TODO(csilvers): figure out a better way
-extern GFLAGS_DLL_DECL std::string CommandlineFlagsIntoString();
-// Usually where this is used, a FlagSaver should be used instead.
-extern GFLAGS_DLL_DECL
-bool ReadFlagsFromString(const std::string& flagfilecontents,
- const char* prog_name,
- bool errors_are_fatal); // uses SET_FLAGS_VALUE
-
-// These let you manually implement --flagfile functionality.
-// DEPRECATED.
-extern GFLAGS_DLL_DECL bool AppendFlagsIntoFile(const std::string& filename, const char* prog_name);
-extern GFLAGS_DLL_DECL bool ReadFromFlagsFile(const std::string& filename, const char* prog_name, bool errors_are_fatal); // uses SET_FLAGS_VALUE
-
-
-// --------------------------------------------------------------------
-// Useful routines for initializing flags from the environment.
-// In each case, if 'varname' does not exist in the environment
-// return defval. If 'varname' does exist but is not valid
-// (e.g., not a number for an int32 flag), abort with an error.
-// Otherwise, return the value. NOTE: for booleans, for true use
-// 't' or 'T' or 'true' or '1', for false 'f' or 'F' or 'false' or '0'.
-
-extern GFLAGS_DLL_DECL bool BoolFromEnv(const char *varname, bool defval);
-extern GFLAGS_DLL_DECL int32 Int32FromEnv(const char *varname, int32 defval);
-extern GFLAGS_DLL_DECL int64 Int64FromEnv(const char *varname, int64 defval);
-extern GFLAGS_DLL_DECL uint64 Uint64FromEnv(const char *varname, uint64 defval);
-extern GFLAGS_DLL_DECL double DoubleFromEnv(const char *varname, double defval);
-extern GFLAGS_DLL_DECL const char *StringFromEnv(const char *varname, const char *defval);
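-
-// Example use (a sketch; MYAPP_PORT is a hypothetical environment variable):
-//    DEFINE_int32(port, Int32FromEnv("MYAPP_PORT", 8080),
-//                 "What port to listen on");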
-
-
-// --------------------------------------------------------------------
-// The next two functions parse gflags from main():
-
-// Set the "usage" message for this program. For example:
-// string usage("This program does nothing. Sample usage:\n");
-// usage += argv[0] + " <uselessarg1> <uselessarg2>";
-// SetUsageMessage(usage);
-// Do not include commandline flags in the usage: we do that for you!
-// Thread-hostile; meant to be called before any threads are spawned.
-extern GFLAGS_DLL_DECL void SetUsageMessage(const std::string& usage);
-
-// Sets the version string, which is emitted with --version.
-// For instance: SetVersionString("1.3");
-// Thread-hostile; meant to be called before any threads are spawned.
-extern GFLAGS_DLL_DECL void SetVersionString(const std::string& version);
-
-
-// Looks for flags in argv and parses them. Rearranges argv to put
-// flags first, or removes them entirely if remove_flags is true.
-// If a flag is defined more than once in the command line or flag
-// file, the last definition is used. Returns the index (into argv)
-// of the first non-flag argument.
-// See top-of-file for more details on this function.
-#ifndef SWIG // In swig, use ParseCommandLineFlagsScript() instead.
-extern GFLAGS_DLL_DECL uint32 ParseCommandLineFlags(int *argc, char*** argv, bool remove_flags);
-#endif
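-
-// Example use (a sketch of a typical main()):
-//    int main(int argc, char** argv) {
-//      SetUsageMessage("Sample usage: " + std::string(argv[0]) + " <args>");
-//      SetVersionString("0.1");
-//      uint32 first_arg = ParseCommandLineFlags(&argc, &argv, true);
-//      // argv[first_arg] is the first non-flag command-line argument.
-//    }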
-
-
-// Calls to ParseCommandLineNonHelpFlags and then to
-// HandleCommandLineHelpFlags can be used instead of a call to
-// ParseCommandLineFlags during initialization, in order to allow for
-// changing default values for some FLAGS (via
-// e.g. SetCommandLineOptionWithMode calls) between the time of
-// command line parsing and the time of dumping help information for
-// the flags as a result of command line parsing. If a flag is
-// defined more than once in the command line or flag file, the last
-// definition is used. Returns the index (into argv) of the first
-// non-flag argument. (If remove_flags is true, will always return 1.)
-extern GFLAGS_DLL_DECL uint32 ParseCommandLineNonHelpFlags(int *argc, char*** argv, bool remove_flags);
-
-// This is actually defined in gflags_reporting.cc.
-// This function is misnamed (it also handles --version, etc.), but
-// it's too late to change that now. :-(
-extern GFLAGS_DLL_DECL void HandleCommandLineHelpFlags(); // in gflags_reporting.cc
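-
-// Example of the two-phase form described above (a sketch; "threads" is a
-// hypothetical flag whose default is adjusted between the two calls):
-//    ParseCommandLineNonHelpFlags(&argc, &argv, true);
-//    SetCommandLineOptionWithMode("threads", "8", SET_FLAGS_DEFAULT);
-//    HandleCommandLineHelpFlags();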
-
-// Allow command line reparsing. Disables the error normally
-// generated when an unknown flag is found, since it may be found in a
-// later parse. Thread-hostile; meant to be called before any threads
-// are spawned.
-extern GFLAGS_DLL_DECL void AllowCommandLineReparsing();
-
-// Reparse the flags that have not yet been recognized. Only flags
-// registered since the last parse will be recognized. Any flag value
-// must be provided as part of the argument using "=", not as a
-// separate command line argument that follows the flag argument.
-// Intended for handling flags from dynamically loaded libraries,
-// since their flags are not registered until they are loaded.
-extern GFLAGS_DLL_DECL void ReparseCommandLineNonHelpFlags();
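-
-// Example use (a sketch; "mylib.so" is a hypothetical plugin that defines
-// its own flags):
-//    AllowCommandLineReparsing();
-//    ParseCommandLineFlags(&argc, &argv, false);
-//    dlopen("mylib.so", RTLD_NOW);  // registers the plugin's flags
-//    ReparseCommandLineNonHelpFlags();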
-
-// Clean up memory allocated by flags. This is only needed to reduce
-// the quantity of "potentially leaked" reports emitted by memory
-// debugging tools such as valgrind. It is not required for normal
-// operation, or for the google perftools heap-checker. It must only
-// be called when the process is about to exit, and all threads that
-// might access flags are quiescent. Referencing flags after this is
-// called will have unexpected consequences. This is not safe to run
-// when multiple threads might be running: the function is
-// thread-hostile.
-extern GFLAGS_DLL_DECL void ShutDownCommandLineFlags();
-
-
-// --------------------------------------------------------------------
-// Now come the command line flag declaration/definition macros that
-// will actually be used. They're kind of hairy. A major reason
-// for this is initialization: we want people to be able to access
-// variables in global constructors and have that not crash, even if
-// their global constructor runs before the global constructor here.
-// (Obviously, we can't guarantee the flags will have the correct
-// default value in that case, but at least accessing them is safe.)
-// The only way to do that is have flags point to a static buffer.
-// So we make one, using a union to ensure proper alignment, and
-// then use placement-new to actually set up the flag with the
-// correct default value. In the same vein, we have to worry about
-// flag access in global destructors, so FlagRegisterer has to be
-// careful never to destroy the flag-values it constructs.
-//
-// Note that when we define a flag variable FLAGS_<name>, we also
-// preemptively define a junk variable, FLAGS_no<name>. This is to
-// cause a link-time error if someone tries to define 2 flags with
-// names like "logging" and "nologging". We do this because a bool
-// flag FLAG can be set from the command line to true with a "-FLAG"
-// argument, and to false with a "-noFLAG" argument, and so this can
-// potentially avert confusion.
-//
-// We also put flags into their own namespace. It is purposefully
-// named in an opaque way that people should have trouble typing
-// directly. The idea is that DEFINE puts the flag in the weird
-// namespace, and DECLARE imports the flag from there into the current
-// namespace. The net result is to force people to use DECLARE to get
-// access to a flag, rather than saying "extern GFLAGS_DLL_DECL bool FLAGS_whatever;"
-// or some such instead. We want this so we can put extra
-// functionality (like sanity-checking) in DECLARE if we want, and
-// make sure it is picked up everywhere.
-//
-// We also put the type of the variable in the namespace, so that
-// people can't DECLARE_int32 something that they DEFINE_bool'd
-// elsewhere.
-
-class GFLAGS_DLL_DECL FlagRegisterer {
- public:
- FlagRegisterer(const char* name, const char* type,
- const char* help, const char* filename,
- void* current_storage, void* defvalue_storage);
-};
-
-// If your application #defines STRIP_FLAG_HELP to a non-zero value
-// before #including this file, we remove the help message from the
-// binary file. This can reduce the size of the resulting binary
-// somewhat, and may also be useful for security reasons.
-
-extern GFLAGS_DLL_DECL const char kStrippedFlagHelp[];
-
-
-} // namespace GFLAGS_NAMESPACE
-
-
-#ifndef SWIG // In swig, ignore the main flag declarations
-
-#if defined(STRIP_FLAG_HELP) && STRIP_FLAG_HELP > 0
-// Need this construct to avoid the 'defined but not used' warning.
-#define MAYBE_STRIPPED_HELP(txt) \
- (false ? (txt) : GFLAGS_NAMESPACE::kStrippedFlagHelp)
-#else
-#define MAYBE_STRIPPED_HELP(txt) txt
-#endif
-
-// Each command-line flag has two variables associated with it: one
-// with the current value, and one with the default value. However,
-// we have a third variable, which is where value is assigned; it's a
-// constant. This guarantees that FLAG_##value is initialized at
-// static initialization time (e.g. before program-start) rather than
-// global construction time (which is after program-start but
-// before main), at least when 'value' is a compile-time constant. We
-// use a small trick for the "default value" variable, and call it
-// FLAGS_no<name>. This serves the second purpose of assuring a
-// compile error if someone tries to define a flag named no<name>
-// which is illegal (--foo and --nofoo both affect the "foo" flag).
-#define DEFINE_VARIABLE(type, shorttype, name, value, help) \
- namespace fL##shorttype { \
- static const type FLAGS_nono##name = value; \
- /* We always want to export defined variables, dll or no */ \
- GFLAGS_DLL_DEFINE_FLAG type FLAGS_##name = FLAGS_nono##name; \
- type FLAGS_no##name = FLAGS_nono##name; \
- static GFLAGS_NAMESPACE::FlagRegisterer o_##name( \
- #name, #type, MAYBE_STRIPPED_HELP(help), __FILE__, \
- &FLAGS_##name, &FLAGS_no##name); \
- } \
- using fL##shorttype::FLAGS_##name
-
-// For DEFINE_bool, we want to do the extra check that the passed-in
-// value is actually a bool, and not a string or something that can be
-// coerced to a bool. These declarations (no definition needed!) will
-// help us do that, and never evaluate From, which is important.
-// We'll use 'sizeof(IsBoolFlag(val))' to distinguish. This code requires
-// that the compiler have different sizes for bool & double. Since
-// this is not guaranteed by the standard, we check it with a
-// COMPILE_ASSERT.
-namespace fLB {
-struct CompileAssert {};
-typedef CompileAssert expected_sizeof_double_neq_sizeof_bool[
- (sizeof(double) != sizeof(bool)) ? 1 : -1];
-template<typename From> double GFLAGS_DLL_DECL IsBoolFlag(const From& from);
-GFLAGS_DLL_DECL bool IsBoolFlag(bool from);
-} // namespace fLB
-
-// Here are the actual DEFINE_*-macros. The respective DECLARE_*-macros
-// are in a separate include, gflags_declare.h, to reduce the transitive
-// include size for files that only DECLARE flags.
-#define DEFINE_bool(name, val, txt) \
- namespace fLB { \
- typedef ::fLB::CompileAssert FLAG_##name##_value_is_not_a_bool[ \
- (sizeof(::fLB::IsBoolFlag(val)) != sizeof(double))? 1: -1]; \
- } \
- DEFINE_VARIABLE(bool, B, name, val, txt)
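-
-// For instance (a sketch of the compile-time check above):
-//    DEFINE_bool(enable_foo, true, "ok");      // compiles: val is a bool
-//    DEFINE_bool(enable_bar, "true", "bad");   // compile error: IsBoolFlag
-//                                              // returns double here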
-
-#define DEFINE_int32(name, val, txt) \
- DEFINE_VARIABLE(GFLAGS_NAMESPACE::int32, I, \
- name, val, txt)
-
-#define DEFINE_int64(name, val, txt) \
- DEFINE_VARIABLE(GFLAGS_NAMESPACE::int64, I64, \
- name, val, txt)
-
-#define DEFINE_uint64(name,val, txt) \
- DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint64, U64, \
- name, val, txt)
-
-#define DEFINE_double(name, val, txt) \
- DEFINE_VARIABLE(double, D, name, val, txt)
-
-// Strings are trickier, because they're not a POD, so we can't
-// construct them at static-initialization time (instead they get
-// constructed at global-constructor time, which is much later). To
-// try to avoid crashes in that case, we use a char buffer to store
-// the string, which we can static-initialize, and then placement-new
-// into it later. It's not perfect, but the best we can do.
-
-namespace fLS {
-
-inline clstring* dont_pass0toDEFINE_string(char *stringspot,
- const char *value) {
- return new(stringspot) clstring(value);
-}
-inline clstring* dont_pass0toDEFINE_string(char *stringspot,
- const clstring &value) {
- return new(stringspot) clstring(value);
-}
-inline clstring* dont_pass0toDEFINE_string(char *stringspot,
- int value);
-} // namespace fLS
-
-// We need to define a var named FLAGS_no##name so people don't define
-// --string and --nostring. And we need a temporary place to put val
-// so we don't have to evaluate it twice. Two great needs that go
-// great together!
-// The weird 'using' + 'extern' inside the fLS namespace is to work around
-// an unknown compiler bug/issue with the gcc 4.2.1 on SUSE 10. See
-// http://code.google.com/p/google-gflags/issues/detail?id=20
-#define DEFINE_string(name, val, txt) \
- namespace fLS { \
- using ::fLS::clstring; \
- static union { void* align; char s[sizeof(clstring)]; } s_##name[2]; \
- clstring* const FLAGS_no##name = ::fLS:: \
- dont_pass0toDEFINE_string(s_##name[0].s, \
- val); \
- static GFLAGS_NAMESPACE::FlagRegisterer o_##name( \
- #name, "string", MAYBE_STRIPPED_HELP(txt), __FILE__, \
- s_##name[0].s, new (s_##name[1].s) clstring(*FLAGS_no##name)); \
- extern GFLAGS_DLL_DEFINE_FLAG clstring& FLAGS_##name; \
- using fLS::FLAGS_##name; \
- clstring& FLAGS_##name = *FLAGS_no##name; \
- } \
- using fLS::FLAGS_##name
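-
-// Example use (a sketch; the semantics match the other DEFINE_* macros):
-//    DEFINE_string(filename, "my_file.txt", "The file to read");
-//    ... FLAGS_filename then behaves as a std::string, placement-constructed
-//    in the static buffer above.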
-
-#endif // SWIG
-
-
-// Import gflags library symbols into alternative/deprecated namespace(s)
-#include "gflags_gflags.h"
-
-
-#endif // GFLAGS_GFLAGS_H_
diff --git a/files/third_party/gflags/gen/win/include/gflags/gflags_completions.h b/files/third_party/gflags/gen/win/include/gflags/gflags_completions.h
deleted file mode 100644
index f951c1e0..00000000
--- a/files/third_party/gflags/gen/win/include/gflags/gflags_completions.h
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright (c) 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ---
-
-//
-// Implement helpful bash-style command line flag completions
-//
-// ** Functional API:
-// HandleCommandLineCompletions() should be called early during
-// program startup, but after command line flag code has been
-// initialized, such as the beginning of HandleCommandLineHelpFlags().
-// It checks the value of the flag --tab_completion_word. If this
-// flag is empty, nothing happens here. If it contains a string,
-// however, then HandleCommandLineCompletions() will hijack the
-// process, attempting to identify the intention behind this
-// completion. Regardless of the outcome of this deduction, the
-// process will be terminated, similar to --helpshort flag
-// handling.
-//
-// ** Overview of Bash completions:
-// Bash can be told to programmatically determine completions for the
-// current 'cursor word'. It does this by (in this case) invoking a
-// command with some additional arguments identifying the command
-// being executed, the word being completed, and the previous word
-// (if any). Bash then expects a sequence of output lines to be
-// printed to stdout. If these lines all contain a common prefix
-// longer than the cursor word, bash will replace the cursor word
-// with that common prefix, and display nothing. If there isn't such
-// a common prefix, bash will display the lines in pages using 'more'.
-//
-// ** Strategy taken for command line completions:
-// If we can deduce either the exact flag intended, or a common flag
-// prefix, we'll output exactly that. Otherwise, if information
-// must be displayed to the user, we'll take the opportunity to add
-// some helpful information beyond just the flag name (specifically,
-// we'll include the default flag value and as much of the flag's
-// description as can fit on a single terminal line width, as specified
-// by the flag --tab_completion_columns). Furthermore, we'll try to
-// make bash order the output such that the most useful or relevant
-// flags are the most likely to be shown at the top.
-//
-// ** Additional features:
-// To assist in finding that one really useful flag, substring matching
-// was implemented. Before pressing a <TAB> to get completion for the
-// current word, you can append one or more '?' to the flag to do
-// substring matching. Here's the semantics:
-// --foo<TAB> Show me all flags with names prefixed by 'foo'
-// --foo?<TAB> Show me all flags with 'foo' somewhere in the name
-// --foo??<TAB> Same as prior case, but also search in module
-// definition path for 'foo'
-// --foo???<TAB> Same as prior case, but also search in flag
-// descriptions for 'foo'
-// Finally, we'll trim the output to a relatively small number of
-// flags to keep bash quiet about the verbosity of output. If one
-// really wanted to see all possible matches, appending a '+' to the
-// search word will force the exhaustive list of matches to be printed.
-//
-// ** How to have bash accept completions from a binary:
-// Bash requires that it be informed about each command that programmatic
-// completion should be enabled for. An example addition to a .bashrc
-// file would be (your path to the gflags_completions.sh file may differ):
-
-/*
-$ complete -o bashdefault -o default -o nospace -C \
- '/home/build/eng/bash/bash_completions.sh --tab_completion_columns $COLUMNS' \
- time env binary_name another_binary [...]
-*/
-
-// This would allow the following to work:
-// $ /path/to/binary_name --vmodule<TAB>
-// Or:
-// $ ./bin/path/another_binary --gfs_u<TAB>
-// (etc)
-//
-// Sadly, it appears that bash gives no easy way to force this behavior for
-// all commands. That's where the "time" in the above example comes in.
-// If you haven't specifically added a command to the list of completion
-// supported commands, you can still get completions by prefixing the
-// entire command with "env".
-// $ env /some/brand/new/binary --vmod<TAB>
-// Assuming that "binary" is a newly compiled binary, this should still
-// produce the expected completion output.
-
-
-#ifndef GFLAGS_COMPLETIONS_H_
-#define GFLAGS_COMPLETIONS_H_
-
-namespace google {
-
-extern void HandleCommandLineCompletions(void);
-
-}
-
-#endif // GFLAGS_COMPLETIONS_H_
diff --git a/files/third_party/gflags/gen/win/include/gflags/gflags_declare.h b/files/third_party/gflags/gen/win/include/gflags/gflags_declare.h
deleted file mode 100644
index fbc8466f..00000000
--- a/files/third_party/gflags/gen/win/include/gflags/gflags_declare.h
+++ /dev/null
@@ -1,141 +0,0 @@
-// Copyright (c) 1999, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// ---
-//
-// Revamped and reorganized by Craig Silverstein
-//
-// This is the file that should be included by any file which declares
-// a command line flag.
-
-#ifndef GFLAGS_DECLARE_H_
-#define GFLAGS_DECLARE_H_
-
-
-// ---------------------------------------------------------------------------
-// Namespace of gflags library symbols.
-#define GFLAGS_NAMESPACE google
-
-// ---------------------------------------------------------------------------
-// Windows DLL import/export.
-
-// We always want to import the symbols of the gflags library
-#ifndef GFLAGS_DLL_DECL
-# if 0 && defined(_MSC_VER)
-# define GFLAGS_DLL_DECL __declspec(dllimport)
-# else
-# define GFLAGS_DLL_DECL
-# endif
-#endif
-
-// We always want to import variables declared in user code
-#ifndef GFLAGS_DLL_DECLARE_FLAG
-# ifdef _MSC_VER
-# define GFLAGS_DLL_DECLARE_FLAG __declspec(dllimport)
-# else
-# define GFLAGS_DLL_DECLARE_FLAG
-# endif
-#endif
-
-// ---------------------------------------------------------------------------
-// Flag types
-#include <string>
-#if 1
-# include <stdint.h> // the normal place uint32_t is defined
-#elif 1
-# include <sys/types.h> // the normal place u_int32_t is defined
-#elif 0
-# include <inttypes.h> // a third place for uint32_t or u_int32_t
-#endif
-
-namespace GFLAGS_NAMESPACE {
-
-#if 0 // C99
-typedef int32_t int32;
-typedef uint32_t uint32;
-typedef int64_t int64;
-typedef uint64_t uint64;
-#elif 0 // BSD
-typedef int32_t int32;
-typedef u_int32_t uint32;
-typedef int64_t int64;
-typedef u_int64_t uint64;
-#elif 1 // Windows
-typedef __int32 int32;
-typedef unsigned __int32 uint32;
-typedef __int64 int64;
-typedef unsigned __int64 uint64;
-#else
-# error Do not know how to define a 32-bit integer quantity on your system
-#endif
-
-} // namespace GFLAGS_NAMESPACE
-
-
-namespace fLS {
-
-// The meaning of "string" might be different between now and when the
-// macros below get invoked (e.g., if someone is experimenting with
-// other string implementations that get defined after this file is
-// included). Save the current meaning now and use it in the macros.
-typedef std::string clstring;
-
-} // namespace fLS
-
-
-#define DECLARE_VARIABLE(type, shorttype, name) \
- /* We always want to import declared variables, dll or no */ \
- namespace fL##shorttype { extern GFLAGS_DLL_DECLARE_FLAG type FLAGS_##name; } \
- using fL##shorttype::FLAGS_##name
-
-#define DECLARE_bool(name) \
- DECLARE_VARIABLE(bool, B, name)
-
-#define DECLARE_int32(name) \
- DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int32, I, name)
-
-#define DECLARE_int64(name) \
- DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int64, I64, name)
-
-#define DECLARE_uint64(name) \
- DECLARE_VARIABLE(::GFLAGS_NAMESPACE::uint64, U64, name)
-
-#define DECLARE_double(name) \
- DECLARE_VARIABLE(double, D, name)
-
-#define DECLARE_string(name) \
- /* We always want to import declared variables, dll or no */ \
- namespace fLS { \
- using ::fLS::clstring; \
- extern GFLAGS_DLL_DECLARE_FLAG ::fLS::clstring& FLAGS_##name; \
- } \
- using fLS::FLAGS_##name
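-
-// Example use (a sketch): a file that only reads a flag defined elsewhere
-// with DEFINE_bool(verbose, ...) would write:
-//    DECLARE_bool(verbose);
-//    ...
-//    if (FLAGS_verbose) printf("verbose mode\n");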
-
-
-#endif // GFLAGS_DECLARE_H_
diff --git a/files/third_party/gflags/gen/win/include/gflags/gflags_gflags.h b/files/third_party/gflags/gen/win/include/gflags/gflags_gflags.h
deleted file mode 100644
index 0c17825d..00000000
--- a/files/third_party/gflags/gen/win/include/gflags/gflags_gflags.h
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright (c) 2014, Andreas Schuh
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// -----------------------------------------------------------------------------
-// Imports the gflags library symbols into an alternative/deprecated namespace.
-
-#ifndef GFLAGS_GFLAGS_H_
-# error The internal header gflags_gflags.h may only be included by gflags.h
-#endif
-
-#ifndef GFLAGS_NS_GFLAGS_H_
-#define GFLAGS_NS_GFLAGS_H_
-
-
-namespace gflags {
-
-
-using GFLAGS_NAMESPACE::int32;
-using GFLAGS_NAMESPACE::uint32;
-using GFLAGS_NAMESPACE::int64;
-using GFLAGS_NAMESPACE::uint64;
-
-using GFLAGS_NAMESPACE::RegisterFlagValidator;
-using GFLAGS_NAMESPACE::CommandLineFlagInfo;
-using GFLAGS_NAMESPACE::GetAllFlags;
-using GFLAGS_NAMESPACE::ShowUsageWithFlags;
-using GFLAGS_NAMESPACE::ShowUsageWithFlagsRestrict;
-using GFLAGS_NAMESPACE::DescribeOneFlag;
-using GFLAGS_NAMESPACE::SetArgv;
-using GFLAGS_NAMESPACE::GetArgvs;
-using GFLAGS_NAMESPACE::GetArgv;
-using GFLAGS_NAMESPACE::GetArgv0;
-using GFLAGS_NAMESPACE::GetArgvSum;
-using GFLAGS_NAMESPACE::ProgramInvocationName;
-using GFLAGS_NAMESPACE::ProgramInvocationShortName;
-using GFLAGS_NAMESPACE::ProgramUsage;
-using GFLAGS_NAMESPACE::VersionString;
-using GFLAGS_NAMESPACE::GetCommandLineOption;
-using GFLAGS_NAMESPACE::GetCommandLineFlagInfo;
-using GFLAGS_NAMESPACE::GetCommandLineFlagInfoOrDie;
-using GFLAGS_NAMESPACE::FlagSettingMode;
-using GFLAGS_NAMESPACE::SET_FLAGS_VALUE;
-using GFLAGS_NAMESPACE::SET_FLAG_IF_DEFAULT;
-using GFLAGS_NAMESPACE::SET_FLAGS_DEFAULT;
-using GFLAGS_NAMESPACE::SetCommandLineOption;
-using GFLAGS_NAMESPACE::SetCommandLineOptionWithMode;
-using GFLAGS_NAMESPACE::FlagSaver;
-using GFLAGS_NAMESPACE::CommandlineFlagsIntoString;
-using GFLAGS_NAMESPACE::ReadFlagsFromString;
-using GFLAGS_NAMESPACE::AppendFlagsIntoFile;
-using GFLAGS_NAMESPACE::ReadFromFlagsFile;
-using GFLAGS_NAMESPACE::BoolFromEnv;
-using GFLAGS_NAMESPACE::Int32FromEnv;
-using GFLAGS_NAMESPACE::Int64FromEnv;
-using GFLAGS_NAMESPACE::Uint64FromEnv;
-using GFLAGS_NAMESPACE::DoubleFromEnv;
-using GFLAGS_NAMESPACE::StringFromEnv;
-using GFLAGS_NAMESPACE::SetUsageMessage;
-using GFLAGS_NAMESPACE::SetVersionString;
-using GFLAGS_NAMESPACE::ParseCommandLineNonHelpFlags;
-using GFLAGS_NAMESPACE::HandleCommandLineHelpFlags;
-using GFLAGS_NAMESPACE::AllowCommandLineReparsing;
-using GFLAGS_NAMESPACE::ReparseCommandLineNonHelpFlags;
-using GFLAGS_NAMESPACE::ShutDownCommandLineFlags;
-using GFLAGS_NAMESPACE::FlagRegisterer;
-
-#ifndef SWIG
-using GFLAGS_NAMESPACE::ParseCommandLineFlags;
-#endif
-
-
-} // namespace gflags
-
-
-#endif // GFLAGS_NS_GFLAGS_H_
diff --git a/files/third_party/gflags/gen/win/include/private/config.h b/files/third_party/gflags/gen/win/include/private/config.h
deleted file mode 100644
index d541580e..00000000
--- a/files/third_party/gflags/gen/win/include/private/config.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Generated from config.h.in during build configuration using CMake. */
-
-// Note: This header file is only used internally. It is not part of the public interface!
-
-// ---------------------------------------------------------------------------
-// System checks
-
-// Define if you build this library for an MS Windows OS.
-#define OS_WINDOWS
-
-// Define if you have the <stdint.h> header file.
-#define HAVE_STDINT_H
-
-// Define if you have the <sys/types.h> header file.
-#define HAVE_SYS_TYPES_H
-
-// Define if you have the <inttypes.h> header file.
-/* #undef HAVE_INTTYPES_H */
-
-// Define if you have the <sys/stat.h> header file.
-#define HAVE_SYS_STAT_H
-
-// Define if you have the <unistd.h> header file.
-/* #undef HAVE_UNISTD_H */
-
-// Define if you have the <fnmatch.h> header file.
-/* #undef HAVE_FNMATCH_H */
-
-// Define if you have the <shlwapi.h> header file (Windows 2000/XP).
-#define HAVE_SHLWAPI_H
-
-// Define if you have the strtoll function.
-/* #undef HAVE_STRTOLL */
-
-// Define if you have the strtoq function.
-/* #undef HAVE_STRTOQ */
-
-// Define if you have the <pthread.h> header file.
-/* #undef HAVE_PTHREAD */
-
-// Define if your pthread library defines the type pthread_rwlock_t
-/* #undef HAVE_RWLOCK */
-
-// gcc requires this to get PRId64, etc.
-#if defined(HAVE_INTTYPES_H) && !defined(__STDC_FORMAT_MACROS)
-# define __STDC_FORMAT_MACROS 1
-#endif
-
-// ---------------------------------------------------------------------------
-// Package information
-
-// Name of package.
-#define PACKAGE gflags
-
-// Define to the full name of this package.
-#define PACKAGE_NAME gflags
-
-// Define to the full name and version of this package.
-#define PACKAGE_STRING gflags 2.2.0
-
-// Define to the one symbol short name of this package.
-#define PACKAGE_TARNAME gflags-2.2.0
-
-// Define to the version of this package.
-#define PACKAGE_VERSION 2.2.0
-
-// Version number of package.
-#define VERSION PACKAGE_VERSION
-
-// Define to the address where bug reports for this package should be sent.
-#define PACKAGE_BUGREPORT https://github.com/schuhschuh/gflags/issues
-
-// ---------------------------------------------------------------------------
-// Path separator
-#ifndef PATH_SEPARATOR
-# ifdef OS_WINDOWS
-# define PATH_SEPARATOR '\\'
-# else
-# define PATH_SEPARATOR '/'
-# endif
-#endif
-
-// ---------------------------------------------------------------------------
-// Windows
-
-// Whether gflags library is a DLL.
-#ifndef GFLAGS_IS_A_DLL
-# define GFLAGS_IS_A_DLL 0
-#endif
-
-// Always export symbols when compiling a shared library as this file is only
-// included by internal modules when building the gflags library itself.
-// The gflags_declare.h header file will set it to import these symbols otherwise.
-#ifndef GFLAGS_DLL_DECL
-# if GFLAGS_IS_A_DLL && defined(_MSC_VER)
-# define GFLAGS_DLL_DECL __declspec(dllexport)
-# else
-# define GFLAGS_DLL_DECL
-# endif
-#endif
-// Flags defined by the gflags library itself must be exported
-#ifndef GFLAGS_DLL_DEFINE_FLAG
-# define GFLAGS_DLL_DEFINE_FLAG GFLAGS_DLL_DECL
-#endif
-
-#ifdef OS_WINDOWS
-// The unittests import the symbols of the shared gflags library
-# if GFLAGS_IS_A_DLL && defined(_MSC_VER)
-# define GFLAGS_DLL_DECL_FOR_UNITTESTS __declspec(dllimport)
-# endif
-# include "windows_port.h"
-#endif
diff --git a/files/third_party/gflags/gflags.gyp b/files/third_party/gflags/gflags.gyp
deleted file mode 100644
index 37f2815a..00000000
--- a/files/third_party/gflags/gflags.gyp
+++ /dev/null
@@ -1,92 +0,0 @@
-#
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# This is a copy of WebRTC's gflags.gyp.
-
-{
- 'variables': {
- 'gflags_root': '<(DEPTH)/third_party/gflags',
- 'conditions': [
- ['OS=="win"', {
- 'gflags_gen_arch_root': '<(gflags_root)/gen/win',
- }, {
- 'gflags_gen_arch_root': '<(gflags_root)/gen/posix',
- }],
- ],
- },
- 'targets': [
- {
- 'target_name': 'gflags',
- 'type': 'static_library',
- 'include_dirs': [
- '<(gflags_gen_arch_root)/include/gflags', # For configured files.
- '<(gflags_gen_arch_root)/include/private', # For config.h
- '<(gflags_root)/src/src', # For everything else.
- ],
- 'defines': [
- # These macros exist so flags and symbols are properly
- # exported when building DLLs. Since we don't build DLLs, we
- # need to disable them.
- 'GFLAGS_DLL_DECL=',
- 'GFLAGS_DLL_DECLARE_FLAG=',
- 'GFLAGS_DLL_DEFINE_FLAG=',
- ],
- 'direct_dependent_settings': {
- 'include_dirs': [
- '<(gflags_gen_arch_root)/include', # For configured files.
- '<(gflags_root)/src/src', # For everything else.
- ],
- 'defines': [
- 'GFLAGS_DLL_DECL=',
- 'GFLAGS_DLL_DECLARE_FLAG=',
- 'GFLAGS_DLL_DEFINE_FLAG=',
- ],
- },
- 'sources': [
- 'src/src/gflags.cc',
- 'src/src/gflags_completions.cc',
- 'src/src/gflags_reporting.cc',
- ],
- 'conditions': [
- ['OS=="win"', {
- 'sources': [
- 'src/src/windows_port.cc',
- ],
- 'msvs_disabled_warnings': [
- 4005, # WIN32_LEAN_AND_MEAN redefinition.
- 4267, # Conversion from size_t to "type".
- ],
- 'configurations': {
- 'Common_Base': {
- 'msvs_configuration_attributes': {
- 'CharacterSet': '2', # Use Multi-byte Character Set.
- },
- },
- },
- }],
- # TODO(andrew): Look into fixing this warning upstream:
- # http://code.google.com/p/webrtc/issues/detail?id=760
- ['OS=="win" and clang==1', {
- 'msvs_settings': {
- 'VCCLCompilerTool': {
- 'AdditionalOptions': [
- '-Wno-microsoft-include',
- ],
- },
- },
- }],
- ['clang==1', {
- 'cflags': [
- '-Wno-microsoft-include',
- ],
- }],
- ],
- },
- ],
-}
diff --git a/files/tools/OWNERS b/files/tools/OWNERS
deleted file mode 100644
index f0963525..00000000
--- a/files/tools/OWNERS
+++ /dev/null
@@ -1,61 +0,0 @@
-# You can add new small tools to this directory at your desire, feel free
-# to owners-TBR new folders (assuming you have a regular review already,
-# of course). Include an OWNERS file with at least two people for your new
-# folder.
-# If you're changing existing tools, have your change reviewed by the
-# OWNERS of the existing tool.
-
-dpranke@chromium.org
-scottmg@chromium.org
-thakis@chromium.org
-
-# These aren't actually great contact points for this directory, but
-# changes in this directory are rare and most changes happen in better-owned
-# subdirectories.
-#
-# TEAM: infra-dev@chromium.org
-# COMPONENT: Build
-
-per-file bisect*.py=anantha@chromium.org
-per-file bisect*.py=prasadv@chromium.org
-per-file bisect*.py=robertocn@chromium.org
-per-file run-bisect*.py=prasadv@chromium.org
-per-file run-bisect*.py=robertocn@chromium.org
-per-file prepare-bisect*.py=prasadv@chromium.org
-per-file prepare-bisect*.py=robertocn@chromium.org
-
-per-file boilerplate.py=rsesek@chromium.org
-
-per-file check_git_config.py=iannucci@chromium.org
-per-file check_git_config.py=vadimsh@chromium.org
-
-per-file check_grd_for_unused_strings.py=estade@chromium.org
-
-per-file gyp-explain.py=thakis@chromium.org
-
-per-file gypv8shy.py=jochen@chromium.org
-
-per-file include_tracer.py=thakis@chromium.org
-
-per-file ipc_messages_log.py=yfriedman@chromium.org
-
-per-file licenses.py=file://tools/copyright_scanner/OWNERS
-
-per-file remove_stale_pyc_files.py=dtu@chromium.org
-
-per-file roll_angle.py=kbr@chromium.org
-per-file roll_angle.py=kjellander@chromium.org
-per-file roll_angle.py=geofflang@chromium.org
-per-file roll_webgl_conformance.py=bajones@chromium.org
-per-file roll_webgl_conformance.py=kbr@chromium.org
-per-file roll_webgl_conformance.py=kjellander@chromium.org
-per-file roll_webgl_conformance.py=geofflang@chromium.org
-per-file roll_webgl_conformance.py=zmo@chromium.org
-per-file roll_webrtc.py=kjellander@chromium.org
-
-per-file safely-roll-deps.py=borenet@chromium.org
-
-per-file sort-headers.py=satorux@chromium.org
-per-file sort-sources.py=satorux@chromium.org
-per-file yes_no.py=satorux@chromium.org
-
diff --git a/files/tools/msan/OWNERS b/files/tools/msan/OWNERS
deleted file mode 100644
index ab97cb0f..00000000
--- a/files/tools/msan/OWNERS
+++ /dev/null
@@ -1,3 +0,0 @@
-# pbos@chromium.org
-fbarchard@google.com
-kjellander@google.com
diff --git a/files/tools/msan/blacklist.txt b/files/tools/msan/blacklist.txt
deleted file mode 100644
index 40ea4b83..00000000
--- a/files/tools/msan/blacklist.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-# The rules in this file are only applied at compile time. If you can modify the
-# source in question, consider function attributes to disable instrumentation.
-#
-# Please think twice before you add or remove these rules.
-
-# False positive in ffmpeg due to assembly code. http://crbug.com/344505
-fun:ff_get_cpu_flags_x86
-
-# Benign uninits in zlib.
-# http://crbug.com/116277
-fun:*MOZ_Z_deflate*
-# http://crbug.com/418383
-fun:longest_match
-
-# Uninit in zlib with SIMD intrinsic http://crbug.com/426868
-fun:crc_fold512_to_32
-
-# Uninit in OSMesa. http://crbug.com/347967
-fun:unpack_RGBA8888
-fun:unpack_RGB888
-
-# False positives due to use of linux_syscall_support. http://crbug.com/394028
-src:*/breakpad/src/*
-src:*/components/crash/content/app/breakpad_linux.cc
diff --git a/files/tools/ubsan/OWNERS b/files/tools/ubsan/OWNERS
deleted file mode 100644
index 32b7466f..00000000
--- a/files/tools/ubsan/OWNERS
+++ /dev/null
@@ -1,3 +0,0 @@
-# pbos@webrtc.org
-kjellander@google.com
-fbarchard@google.com
diff --git a/files/tools/ubsan/blacklist.txt b/files/tools/ubsan/blacklist.txt
deleted file mode 100644
index e1e3c08a..00000000
--- a/files/tools/ubsan/blacklist.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-#############################################################################
-# UBSan blacklist.
-
-#############################################################################
-# YASM does some funny things that UBsan doesn't like.
-# https://crbug.com/489901
-src:*/third_party/yasm/*
-
-#############################################################################
-# V8 gives too many false positives. Ignore them for now.
-src:*/v8/*
-
-#############################################################################
-# Ignore system libraries.
-src:*/usr/*
-
-#############################################################################
-# V8 UBSan suppressions, commented out for now since we are ignoring v8
-# completely.
-# fun:*v8*internal*FastD2I*
-# fun:*v8*internal*ComputeIntegerHash*
-# fun:*v8*internal*ComputeLongHash*
-# fun:*v8*internal*ComputePointerHash*
-# src:*/v8/src/base/bits.cc
-# src:*/v8/src/base/functional.cc
-# Undefined behaviour (integer overflow) is expected but ignored in this
-# function.
-# fun:*JsonParser*ParseJsonNumber*
-
-# Runtime numeric functions.
-# src:*/v8/src/runtime/runtime-numbers.cc
-
-# Shifts of negative numbers
-# fun:*v8*internal*HPositionInfo*TagPosition*
-# fun:*v8*internal*Range*Shl*
-# fun:*v8*internal*RelocInfoWriter*WriteTaggedData*
-
-#############################################################################
-# Undefined arithmetic that can be safely ignored.
-src:*/base/numerics/saturated_arithmetic.h
-src:*/ppapi/shared_impl/id_assignment.h
-
-#############################################################################
-# ICU suppressions. Mostly hash functions where integer overflow is OK.
-fun:*hashEntry*
-fun:*LocaleCacheKey*hashCode*
-fun:*google*protobuf*hash*
-fun:*(hash|Hash)*
-
-#############################################################################
-# Bounds blacklist.
-# Array at the end of struct pattern:
-# Maybe UBSan itself can be improved here?
-# e.g.
-# struct blah {
-# int a;
-# char foo[2]; // not actually 2
-# }
-src:*/net/disk_cache/blockfile/backend_impl.cc
-src:*/net/disk_cache/blockfile/entry_impl.cc
-src:*/third_party/icu/source/common/rbbi.cpp
-src:*/third_party/icu/source/common/rbbitblb.cpp
-src:*/third_party/icu/source/common/ucmndata.c
-
-#############################################################################
-# Delete in destructor on a this where this == nullptr
-fun:*re2*RegexpD*
-
-#############################################################################
-# Harmless float division by zero.
-fun:*RendererFrameManager*CullUnlockedFrames*
-
-#############################################################################
-# libc++ __tree and map are not UBSAN clean
-# https://llvm.org/bugs/show_bug.cgi?id=19302
-src:*/third_party/libc\+\+/trunk/include/__tree
-src:*/third_party/libc\+\+/trunk/include/map
diff --git a/files/tools/ubsan/vptr_blacklist.txt b/files/tools/ubsan/vptr_blacklist.txt
deleted file mode 100644
index e8382039..00000000
--- a/files/tools/ubsan/vptr_blacklist.txt
+++ /dev/null
@@ -1,128 +0,0 @@
-#############################################################################
-# UBSan vptr blacklist.
-# Function- and type-based blacklisting use mangled names, and it is especially
-# tricky to represent C++ types. For now, any possible variations due to name
-# mangling are simply represented as regexp wildcard expressions, so this list
-# may over-blacklist.
-
-#############################################################################
-# Identical layouts.
-# If base and derived classes have identical memory layouts (i.e., the same
-# object size) and both have no virtual functions, we blacklist them as there
-# are not many security implications.
-
-fun:*LifecycleNotifier*addObserver*
-fun:*LifecycleNotifier*removeObserver*
-fun:*toWebInputElement*
-type:*base*MessageLoopForIO*
-type:*BlockRefType*
-type:*SkAutoTUnref*
-type:*WDResult*
-type:*ExecutionContext*
-type:*WebInputElement*
-type:*WebFormControlElement*
-
-# Avoid identical layout cases for 86 different classes in InspectorTypeBuilder,
-# all of which are guarded using COMPILER_ASSERT on the object size. Two more
-# types are also blacklisted due to the template class (JSONArray <-> Array<T>).
-
-src:*InspectorTypeBuilder.h*
-type:*TypeBuilder*
-type:*JSONArray*
-
-#############################################################################
-# Base class's constructor accesses a derived class's member.
-
-fun:*DoublyLinkedListNode*
-type:*content*WebUIExtensionData*
-
-# RenderFrameObserverTracker<T>::RenderFrameObserverTracker()
-fun:*content*RenderFrameObserverTracker*RenderFrame*
-
-# RenderViewObserverTracker<T>::RenderViewObserverTracker()
-fun:*content*RenderViewObserverTracker*RenderView*
-
-#############################################################################
-# Base class's destructor accesses a derived class.
-
-fun:*DatabaseContext*contextDestroyed*
-
-# FIXME: Cannot handle template function LifecycleObserver<>::setContext,
-# so exclude source file for now.
-src:*LifecycleObserver.h*
-
-#############################################################################
-# static_cast into itself in the constructor.
-
-fun:*RefCountedGarbageCollected*makeKeepAlive*
-fun:*ThreadSafeRefCountedGarbageCollected*makeKeepAlive*
-
-#############################################################################
-# Accessing data in destructors where the class has virtual inheritances.
-
-type:*content*RenderWidgetHost*
-
-# Match mangled name for X::~X().
-fun:*content*RenderThreadImplD*
-fun:*content*RenderViewHostImplD*
-fun:*content*UtilityThreadImplD*
-
-#############################################################################
-# Using raw pointer values.
-#
-# A raw pointer value (16) is used to infer the field offset by
-# GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET.
-
-src:*/third_party/protobuf/src/google/protobuf/compiler/plugin.pb.cc
-src:*/third_party/protobuf/src/google/protobuf/compiler/cpp/cpp_message.cc
-src:*/third_party/protobuf/src/google/protobuf/descriptor.pb.cc
-
-#############################################################################
-# Avoid link errors.
-# Ubsan vptr needs typeinfo on the target class, but it looks like typeinfo is
-# not available if the class is not exported. For now, simply blacklist to
-# avoid link errors; e.g., undefined reference to 'typeinfo for [CLASS_NAME]'.
-
-# obj/ppapi/libppapi_proxy.a(obj/ppapi/proxy/ppapi_proxy.proxy_channel.o):../../ppapi/proxy/proxy_channel.cc:__unnamed_53: error: undefined reference to 'typeinfo for IPC::TestSink'
-src:*/ppapi/proxy/proxy_channel.cc
-
-# obj/chrome/libbrowser.a(obj/chrome/browser/net/browser.predictor.o):../../chrome/browser/net/predictor.cc:__unnamed_577: error: undefined reference to 'typeinfo for ProxyAdvisor'
-src:*/chrome/browser/net/predictor.cc
-
-# obj/third_party/pdfium/libfpdfapi.a(obj/third_party/pdfium/core/src/fpdfapi/fpdf_render/fpdfapi.fpdf_render_text.o):../../third_party/pdfium/core/src/fpdfapi/fpdf_render/:__unnamed_360: error: undefined reference to 'typeinfo for CPDF_InlineImages'
-src:*/third_party/pdfium/core/src/fpdfapi/fpdf_render/fpdf_render_text.cpp
-
-# obj/third_party/libwebm/libwebm.a(obj/third_party/libwebm/source/libwebm.mkvmuxer.o)(.data.rel..L__unnamed_2+0x18): error: undefined reference to 'typeinfo for mkvparser::IMkvReader'
-src:*/third_party/libwebm/source/mkvmuxer.cpp
-
-#############################################################################
-# LLVM is not UBSan vptr clean.
-src:*third_party/swiftshader/third_party/LLVM*
-
-#############################################################################
-# UBSan seems to emit false positives when virtual base classes are
-# involved; see e.g. crbug.com/448102.
-
-type:*v8*internal*OFStream*
-
-#############################################################################
-# UBsan is unable to handle static_cast<A*>(nullptr) and crashes with SIGSEGV.
-#
-
-# static_cast<StartPageService*> in StartPageServiceFactory::GetForProfile.
-type:*StartPageService*
-
-# Remove once function attribute level blacklisting is implemented.
-# See crbug.com/476063.
-fun:*forbidGCDuringConstruction*
-
-#############################################################################
-# UBsan goes into an infinite recursion when __dynamic_cast is instrumented
-# with "vptr". See crbug.com/609786.
-
-src:*/third_party/libc\+\+abi/trunk/src/private_typeinfo.cpp
-
-#############################################################################
-# invalid downcasts for IPC messages
-# https://crbug.com/520760
-src:*nacl_message_scanner.cc
diff --git a/files/tools_libyuv/autoroller/roll_deps.py b/files/tools_libyuv/autoroller/roll_deps.py
deleted file mode 100755
index 8359d309..00000000
--- a/files/tools_libyuv/autoroller/roll_deps.py
+++ /dev/null
@@ -1,507 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2017 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# This is a modified copy of the script in
-# https://webrtc.googlesource.com/src/+/master/tools_webrtc/autoroller/roll_deps.py
-# customized for libyuv.
-
-
-"""Script to automatically roll dependencies in the libyuv DEPS file."""
-
-import argparse
-import base64
-import collections
-import logging
-import os
-import re
-import subprocess
-import sys
-import urllib2
-
-
-# Skip these dependencies (list without solution name prefix).
-DONT_AUTOROLL_THESE = [
- 'src/third_party/gflags/src',
-]
-
-LIBYUV_URL = 'https://chromium.googlesource.com/libyuv/libyuv'
-CHROMIUM_SRC_URL = 'https://chromium.googlesource.com/chromium/src'
-CHROMIUM_COMMIT_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s'
-CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s'
-CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s'
-
-COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$')
-CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'(\d+)\'$')
-ROLL_BRANCH_NAME = 'roll_chromium_revision'
-
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-CHECKOUT_SRC_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.pardir,
- os.pardir))
-CHECKOUT_ROOT_DIR = os.path.realpath(os.path.join(CHECKOUT_SRC_DIR, os.pardir))
-
-sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build'))
-import find_depot_tools
-find_depot_tools.add_depot_tools_to_path()
-
-CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py'
-CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools',
- 'clang', 'scripts', 'update.py')
-
-DepsEntry = collections.namedtuple('DepsEntry', 'path url revision')
-ChangedDep = collections.namedtuple('ChangedDep',
- 'path url current_rev new_rev')
-
-class RollError(Exception):
- pass
-
-
-def VarLookup(local_scope):
- return lambda var_name: local_scope['vars'][var_name]
-
-
-def ParseDepsDict(deps_content):
- local_scope = {}
- global_scope = {
- 'Var': VarLookup(local_scope),
- 'deps_os': {},
- }
- exec(deps_content, global_scope, local_scope)
- return local_scope
-
-
-def ParseLocalDepsFile(filename):
- with open(filename, 'rb') as f:
- deps_content = f.read()
- return ParseDepsDict(deps_content)
-
-
-def ParseRemoteCrDepsFile(revision):
- deps_content = ReadRemoteCrFile('DEPS', revision)
- return ParseDepsDict(deps_content)
-
-
-def ParseCommitPosition(commit_message):
- for line in reversed(commit_message.splitlines()):
- m = COMMIT_POSITION_RE.match(line.strip())
- if m:
- return int(m.group(1))
- logging.error('Failed to parse commit position id from:\n%s\n',
- commit_message)
- sys.exit(-1)
-
-
-def _RunCommand(command, working_dir=None, ignore_exit_code=False,
- extra_env=None):
- """Runs a command and returns the output from that command.
-
- If the command fails (exit code != 0), the function will exit the process.
-
- Returns:
- A tuple containing the stdout and stderr outputs as strings.
- """
- working_dir = working_dir or CHECKOUT_SRC_DIR
- logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir)
- env = os.environ.copy()
- if extra_env:
- assert all(isinstance(value, str) for value in extra_env.values())
- logging.debug('extra env: %s', extra_env)
- env.update(extra_env)
- p = subprocess.Popen(command, stdout=subprocess.PIPE,
- stderr=subprocess.PIPE, env=env,
- cwd=working_dir, universal_newlines=True)
- std_output = p.stdout.read()
- err_output = p.stderr.read()
- p.wait()
- p.stdout.close()
- p.stderr.close()
- if not ignore_exit_code and p.returncode != 0:
- logging.error('Command failed: %s\n'
- 'stdout:\n%s\n'
- 'stderr:\n%s\n', ' '.join(command), std_output, err_output)
- sys.exit(p.returncode)
- return std_output, err_output
-
-
-def _GetBranches():
- """Returns a tuple of active,branches.
-
- The 'active' is the name of the currently active branch and 'branches' is a
- list of all branches.
- """
- lines = _RunCommand(['git', 'branch'])[0].split('\n')
- branches = []
- active = ''
- for line in lines:
- if '*' in line:
- # The assumption is that the first char will always be the '*'.
- active = line[1:].strip()
- branches.append(active)
- else:
- branch = line.strip()
- if branch:
- branches.append(branch)
- return active, branches
-
-
-def _ReadGitilesContent(url):
- # Download and decode BASE64 content until
- # https://code.google.com/p/gitiles/issues/detail?id=7 is fixed.
- base64_content = ReadUrlContent(url + '?format=TEXT')
- return base64.b64decode(base64_content[0])
-
-
-def ReadRemoteCrFile(path_below_src, revision):
- """Reads a remote Chromium file of a specific revision. Returns a string."""
- return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE % (revision,
- path_below_src))
-
-
-def ReadRemoteCrCommit(revision):
- """Reads a remote Chromium commit message. Returns a string."""
- return _ReadGitilesContent(CHROMIUM_COMMIT_TEMPLATE % revision)
-
-
-def ReadUrlContent(url):
- """Connect to a remote host and read the contents. Returns a list of lines."""
- conn = urllib2.urlopen(url)
- try:
- return conn.readlines()
- except IOError as e:
- logging.exception('Error connecting to %s. Error: %s', url, e)
- raise
- finally:
- conn.close()
-
-
-def GetMatchingDepsEntries(depsentry_dict, dir_path):
- """Gets all deps entries matching the provided path.
-
- This list may contain more than one DepsEntry object.
- Example: dir_path='src/testing' would give results containing both
- 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's DEPS.
- Example 2: dir_path='src/build' should return 'src/build' but not
- 'src/buildtools'.
-
- Returns:
- A list of DepsEntry objects.
- """
- result = []
- for path, depsentry in depsentry_dict.iteritems():
- if path == dir_path:
- result.append(depsentry)
- else:
- parts = path.split('/')
- if all(part == parts[i]
- for i, part in enumerate(dir_path.split('/'))):
- result.append(depsentry)
- return result
-
-
-def BuildDepsentryDict(deps_dict):
- """Builds a dict of paths to DepsEntry objects from a raw parsed deps dict."""
- result = {}
- def AddDepsEntries(deps_subdict):
- for path, deps_url_spec in deps_subdict.iteritems():
-      # The deps url spec is either a dict with a URL and a condition, or just
- if isinstance(deps_url_spec, dict):
- if deps_url_spec.get('dep_type') == 'cipd':
- continue
- deps_url = deps_url_spec['url']
- else:
- deps_url = deps_url_spec
-
- if not result.has_key(path):
- url, revision = deps_url.split('@') if deps_url else (None, None)
- result[path] = DepsEntry(path, url, revision)
-
- AddDepsEntries(deps_dict['deps'])
-  for deps_os in ['win', 'mac', 'unix', 'android', 'ios']:
- AddDepsEntries(deps_dict.get('deps_os', {}).get(deps_os, {}))
- return result
-
-
-def CalculateChangedDeps(libyuv_deps, new_cr_deps):
- """
- Calculate changed deps entries based on entries defined in the libyuv DEPS
- file:
-  - If it's a dependency shared with the Chromium DEPS file: roll it to the
-    same revision as Chromium (i.e. the entry in the new_cr_deps dict).
- - If it's a Chromium sub-directory, roll it to the HEAD revision (notice
- this means it may be ahead of the chromium_revision, but generally these
- should be close).
- - If it's another DEPS entry (not shared with Chromium), roll it to HEAD
- unless it's configured to be skipped.
-
- Returns:
- A list of ChangedDep objects representing the changed deps.
- """
- result = []
- libyuv_entries = BuildDepsentryDict(libyuv_deps)
- new_cr_entries = BuildDepsentryDict(new_cr_deps)
- for path, libyuv_deps_entry in libyuv_entries.iteritems():
- if path in DONT_AUTOROLL_THESE:
- continue
- cr_deps_entry = new_cr_entries.get(path)
- if cr_deps_entry:
- # Use the revision from Chromium's DEPS file.
- new_rev = cr_deps_entry.revision
- assert libyuv_deps_entry.url == cr_deps_entry.url, (
- 'Libyuv DEPS entry %s has a different URL (%s) than Chromium (%s).' %
- (path, libyuv_deps_entry.url, cr_deps_entry.url))
- else:
- # Use the HEAD of the deps repo.
- stdout, _ = _RunCommand(['git', 'ls-remote', libyuv_deps_entry.url,
- 'HEAD'])
- new_rev = stdout.strip().split('\t')[0]
-
- # Check if an update is necessary.
- if libyuv_deps_entry.revision != new_rev:
- logging.debug('Roll dependency %s to %s', path, new_rev)
- result.append(ChangedDep(path, libyuv_deps_entry.url,
- libyuv_deps_entry.revision, new_rev))
- return sorted(result)
-
-
-def CalculateChangedClang(new_cr_rev):
- def GetClangRev(lines):
- for line in lines:
- match = CLANG_REVISION_RE.match(line)
- if match:
- return match.group(1)
- raise RollError('Could not parse Clang revision!')
-
- with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'rb') as f:
- current_lines = f.readlines()
- current_rev = GetClangRev(current_lines)
-
- new_clang_update_py = ReadRemoteCrFile(CLANG_UPDATE_SCRIPT_URL_PATH,
- new_cr_rev).splitlines()
- new_rev = GetClangRev(new_clang_update_py)
- return ChangedDep(CLANG_UPDATE_SCRIPT_LOCAL_PATH, None, current_rev, new_rev)
-
-
-def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos,
- new_commit_pos, changed_deps_list, clang_change):
- current_cr_rev = current_cr_rev[0:10]
- new_cr_rev = new_cr_rev[0:10]
- rev_interval = '%s..%s' % (current_cr_rev, new_cr_rev)
- git_number_interval = '%s:%s' % (current_commit_pos, new_commit_pos)
-
- commit_msg = ['Roll chromium_revision %s (%s)\n' % (rev_interval,
- git_number_interval)]
- commit_msg.append('Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval))
- commit_msg.append('Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE %
- rev_interval))
- if changed_deps_list:
- commit_msg.append('Changed dependencies:')
-
- for c in changed_deps_list:
- commit_msg.append('* %s: %s/+log/%s..%s' % (c.path, c.url,
- c.current_rev[0:10],
- c.new_rev[0:10]))
- change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, 'DEPS')
- commit_msg.append('DEPS diff: %s\n' % change_url)
- else:
- commit_msg.append('No dependencies changed.')
-
- if clang_change.current_rev != clang_change.new_rev:
- commit_msg.append('Clang version changed %s:%s' %
- (clang_change.current_rev, clang_change.new_rev))
- change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval,
- CLANG_UPDATE_SCRIPT_URL_PATH)
- commit_msg.append('Details: %s\n' % change_url)
- else:
- commit_msg.append('No update to Clang.\n')
-
- # TBR needs to be non-empty for Gerrit to process it.
- git_author = _RunCommand(['git', 'config', 'user.email'],
- working_dir=CHECKOUT_SRC_DIR)[0].strip()
- commit_msg.append('TBR=%s' % git_author)
-
- commit_msg.append('BUG=None')
- return '\n'.join(commit_msg)
-
-
-def UpdateDepsFile(deps_filename, old_cr_revision, new_cr_revision,
- changed_deps):
- """Update the DEPS file with the new revision."""
-
- # Update the chromium_revision variable.
- with open(deps_filename, 'rb') as deps_file:
- deps_content = deps_file.read()
- deps_content = deps_content.replace(old_cr_revision, new_cr_revision)
- with open(deps_filename, 'wb') as deps_file:
- deps_file.write(deps_content)
-
- # Update each individual DEPS entry.
- for dep in changed_deps:
- local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path)
- if not os.path.isdir(local_dep_dir):
- raise RollError(
- 'Cannot find local directory %s. Make sure the .gclient file\n'
- 'contains all platforms in the target_os list, i.e.\n'
- 'target_os = ["android", "unix", "mac", "ios", "win"];\n'
- 'Then run "gclient sync" again.' % local_dep_dir)
- _RunCommand(
- ['gclient', 'setdep', '--revision', '%s@%s' % (dep.path, dep.new_rev)],
- working_dir=CHECKOUT_SRC_DIR)
-
-
-def _IsTreeClean():
- stdout, _ = _RunCommand(['git', 'status', '--porcelain'])
- if len(stdout) == 0:
- return True
-
- logging.error('Dirty/unversioned files:\n%s', stdout)
- return False
-
-
-def _EnsureUpdatedMasterBranch(dry_run):
- current_branch = _RunCommand(
- ['git', 'rev-parse', '--abbrev-ref', 'HEAD'])[0].splitlines()[0]
- if current_branch != 'master':
- logging.error('Please checkout the master branch and re-run this script.')
- if not dry_run:
- sys.exit(-1)
-
- logging.info('Updating master branch...')
- _RunCommand(['git', 'pull'])
-
-
-def _CreateRollBranch(dry_run):
- logging.info('Creating roll branch: %s', ROLL_BRANCH_NAME)
- if not dry_run:
- _RunCommand(['git', 'checkout', '-b', ROLL_BRANCH_NAME])
-
-
-def _RemovePreviousRollBranch(dry_run):
- active_branch, branches = _GetBranches()
- if active_branch == ROLL_BRANCH_NAME:
- active_branch = 'master'
- if ROLL_BRANCH_NAME in branches:
- logging.info('Removing previous roll branch (%s)', ROLL_BRANCH_NAME)
- if not dry_run:
- _RunCommand(['git', 'checkout', active_branch])
- _RunCommand(['git', 'branch', '-D', ROLL_BRANCH_NAME])
-
-
-def _LocalCommit(commit_msg, dry_run):
- logging.info('Committing changes locally.')
- if not dry_run:
- _RunCommand(['git', 'add', '--update', '.'])
- _RunCommand(['git', 'commit', '-m', commit_msg])
-
-
-def ChooseCQMode(skip_cq, cq_over, current_commit_pos, new_commit_pos):
- if skip_cq:
- return 0
- if (new_commit_pos - current_commit_pos) < cq_over:
- return 1
- return 2
-
-
-def _UploadCL(commit_queue_mode):
- """Upload the committed changes as a changelist to Gerrit.
-
- commit_queue_mode:
- - 2: Submit to commit queue.
- - 1: Run trybots but do not submit to CQ.
- - 0: Skip CQ, upload only.
- """
- cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks', '--send-mail']
- if commit_queue_mode >= 2:
- logging.info('Sending the CL to the CQ...')
- cmd.extend(['--use-commit-queue'])
- elif commit_queue_mode >= 1:
- logging.info('Starting CQ dry run...')
- cmd.extend(['--cq-dry-run'])
- extra_env = {
- 'EDITOR': 'true',
- 'SKIP_GCE_AUTH_FOR_GIT': '1',
- }
- stdout, stderr = _RunCommand(cmd, extra_env=extra_env)
- logging.debug('Output from "git cl upload":\nstdout:\n%s\n\nstderr:\n%s',
- stdout, stderr)
-
-
-def main():
- p = argparse.ArgumentParser()
- p.add_argument('--clean', action='store_true', default=False,
- help='Removes any previous local roll branch.')
- p.add_argument('-r', '--revision',
- help=('Chromium Git revision to roll to. Defaults to the '
- 'Chromium HEAD revision if omitted.'))
- p.add_argument('--dry-run', action='store_true', default=False,
- help=('Calculate changes and modify DEPS, but don\'t create '
- 'any local branch, commit, upload CL or send any '
- 'tryjobs.'))
- p.add_argument('-i', '--ignore-unclean-workdir', action='store_true',
- default=False,
- help=('Ignore if the current branch is not master or if there '
- 'are uncommitted changes (default: %(default)s).'))
- grp = p.add_mutually_exclusive_group()
- grp.add_argument('--skip-cq', action='store_true', default=False,
- help='Skip sending the CL to the CQ (default: %(default)s)')
- grp.add_argument('--cq-over', type=int, default=1,
- help=('Commit queue dry run if the revision difference '
- 'is below this number (default: %(default)s)'))
- p.add_argument('-v', '--verbose', action='store_true', default=False,
- help='Be extra verbose in printing of log messages.')
- opts = p.parse_args()
-
- if opts.verbose:
- logging.basicConfig(level=logging.DEBUG)
- else:
- logging.basicConfig(level=logging.INFO)
-
- if not opts.ignore_unclean_workdir and not _IsTreeClean():
- logging.error('Please clean your local checkout first.')
- return 1
-
- if opts.clean:
- _RemovePreviousRollBranch(opts.dry_run)
-
- if not opts.ignore_unclean_workdir:
- _EnsureUpdatedMasterBranch(opts.dry_run)
-
- new_cr_rev = opts.revision
- if not new_cr_rev:
- stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD'])
- head_rev = stdout.strip().split('\t')[0]
- logging.info('No revision specified. Using HEAD: %s', head_rev)
- new_cr_rev = head_rev
-
- deps_filename = os.path.join(CHECKOUT_SRC_DIR, 'DEPS')
- libyuv_deps = ParseLocalDepsFile(deps_filename)
- current_cr_rev = libyuv_deps['vars']['chromium_revision']
-
- current_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(current_cr_rev))
- new_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(new_cr_rev))
-
- new_cr_deps = ParseRemoteCrDepsFile(new_cr_rev)
- changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps)
- clang_change = CalculateChangedClang(new_cr_rev)
- commit_msg = GenerateCommitMessage(current_cr_rev, new_cr_rev,
- current_commit_pos, new_commit_pos,
- changed_deps, clang_change)
- logging.debug('Commit message:\n%s', commit_msg)
-
- _CreateRollBranch(opts.dry_run)
- UpdateDepsFile(deps_filename, current_cr_rev, new_cr_rev, changed_deps)
- _LocalCommit(commit_msg, opts.dry_run)
- commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over,
- current_commit_pos, new_commit_pos)
- logging.info('Uploading CL...')
- if not opts.dry_run:
- _UploadCL(commit_queue_mode)
- return 0
-
-
-if __name__ == '__main__':
- sys.exit(main())
diff --git a/files/tools_libyuv/autoroller/unittests/.DS_Store b/files/tools_libyuv/autoroller/unittests/.DS_Store
deleted file mode 100644
index 70369d69..00000000
--- a/files/tools_libyuv/autoroller/unittests/.DS_Store
+++ /dev/null
Binary files differ
diff --git a/files/tools_libyuv/valgrind/chrome_tests.bat b/files/tools_libyuv/valgrind/chrome_tests.bat
deleted file mode 100755
index 9d4c8ca8..00000000
--- a/files/tools_libyuv/valgrind/chrome_tests.bat
+++ /dev/null
@@ -1,53 +0,0 @@
-@echo off
-:: Copyright (c) 2011 The Chromium Authors. All rights reserved.
-:: Use of this source code is governed by a BSD-style license that can be
-:: found in the LICENSE file.
-
-setlocal
-
-set THISDIR=%~dp0
-set TOOL_NAME="unknown"
-
-:: Get the tool name and put it into TOOL_NAME {{{1
-:: NB: SHIFT command doesn't modify %*
-:PARSE_ARGS_LOOP
- if %1 == () GOTO:TOOLNAME_NOT_FOUND
- if %1 == --tool GOTO:TOOLNAME_FOUND
- SHIFT
- goto :PARSE_ARGS_LOOP
-
-:TOOLNAME_NOT_FOUND
-echo "Please specify a tool (e.g. drmemory) by using --tool flag"
-exit /B 1
-
-:TOOLNAME_FOUND
-SHIFT
-set TOOL_NAME=%1
-:: }}}
-if "%TOOL_NAME%" == "drmemory" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_light" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_full" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_pattern" GOTO :SETUP_DRMEMORY
-echo "Unknown tool: `%TOOL_NAME%`! Only drmemory is supported right now"
-exit /B 1
-
-:SETUP_DRMEMORY
-:: Set up DRMEMORY_COMMAND to invoke Dr. Memory {{{1
-set DRMEMORY_PATH=%THISDIR%..\..\third_party\drmemory
-set DRMEMORY_SFX=%DRMEMORY_PATH%\drmemory-windows-sfx.exe
-if EXIST %DRMEMORY_SFX% GOTO DRMEMORY_BINARY_OK
-echo "Can't find Dr. Memory executables."
-echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
-echo "for the instructions on how to get them."
-exit /B 1
-
-:DRMEMORY_BINARY_OK
-%DRMEMORY_SFX% -o%DRMEMORY_PATH%\unpacked -y
-set DRMEMORY_COMMAND=%DRMEMORY_PATH%\unpacked\bin\drmemory.exe
-:: }}}
-goto :RUN_TESTS
-
-:RUN_TESTS
-set PYTHONPATH=%THISDIR%../python/google
-set RUNNING_ON_VALGRIND=yes
-python %THISDIR%/chrome_tests.py %*
diff --git a/files/tools_libyuv/valgrind/chrome_tests.py b/files/tools_libyuv/valgrind/chrome_tests.py
deleted file mode 100755
index fe899bce..00000000
--- a/files/tools_libyuv/valgrind/chrome_tests.py
+++ /dev/null
@@ -1,869 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-''' Runs various chrome tests through valgrind_test.py.'''
-
-import glob
-import logging
-import multiprocessing
-import optparse
-import os
-import stat
-import subprocess
-import sys
-
-import logging_utils
-import path_utils
-
-import common
-import valgrind_test
-
-class TestNotFound(Exception): pass
-
-class MultipleGTestFiltersSpecified(Exception): pass
-
-class BuildDirNotFound(Exception): pass
-
-class BuildDirAmbiguous(Exception): pass
-
-class ExecutableNotFound(Exception): pass
-
-class BadBinary(Exception): pass
-
-class ChromeTests:
- SLOW_TOOLS = ["memcheck", "drmemory"]
- LAYOUT_TESTS_DEFAULT_CHUNK_SIZE = 300
-
- def __init__(self, options, args, test):
- if ':' in test:
- (self._test, self._gtest_filter) = test.split(':', 1)
- else:
- self._test = test
- self._gtest_filter = options.gtest_filter
-
- if self._test not in self._test_list:
- raise TestNotFound("Unknown test: %s" % test)
-
- if options.gtest_filter and options.gtest_filter != self._gtest_filter:
-      raise MultipleGTestFiltersSpecified("Cannot specify both --gtest_filter "
- "and --test %s" % test)
-
- self._options = options
- self._args = args
-
- script_dir = path_utils.ScriptDir()
- # Compute the top of the tree (the "source dir") from the script dir (where
- # this script lives). We assume that the script dir is in tools/valgrind/
- # relative to the top of the tree.
- self._source_dir = os.path.dirname(os.path.dirname(script_dir))
- # since this path is used for string matching, make sure it's always
- # an absolute Unix-style path
- self._source_dir = os.path.abspath(self._source_dir).replace('\\', '/')
- valgrind_test_script = os.path.join(script_dir, "valgrind_test.py")
- self._command_preamble = ["--source-dir=%s" % (self._source_dir)]
-
- if not self._options.build_dir:
- dirs = [
- os.path.join(self._source_dir, "xcodebuild", "Debug"),
- os.path.join(self._source_dir, "out", "Debug"),
- os.path.join(self._source_dir, "build", "Debug"),
- ]
- build_dir = [d for d in dirs if os.path.isdir(d)]
- if len(build_dir) > 1:
- raise BuildDirAmbiguous("Found more than one suitable build dir:\n"
- "%s\nPlease specify just one "
- "using --build-dir" % ", ".join(build_dir))
- elif build_dir:
- self._options.build_dir = build_dir[0]
- else:
- self._options.build_dir = None
-
- if self._options.build_dir:
- build_dir = os.path.abspath(self._options.build_dir)
- self._command_preamble += ["--build-dir=%s" % (self._options.build_dir)]
-
- def _EnsureBuildDirFound(self):
- if not self._options.build_dir:
- raise BuildDirNotFound("Oops, couldn't find a build dir, please "
- "specify it manually using --build-dir")
-
- def _DefaultCommand(self, tool, exe=None, valgrind_test_args=None):
- '''Generates the default command array that most tests will use.'''
- if exe and common.IsWindows():
- exe += '.exe'
-
- cmd = list(self._command_preamble)
-
- # Find all suppressions matching the following pattern:
- # tools/valgrind/TOOL/suppressions[_PLATFORM].txt
- # and list them with --suppressions= prefix.
- script_dir = path_utils.ScriptDir()
-    tool_name = tool.ToolName()
- suppression_file = os.path.join(script_dir, tool_name, "suppressions.txt")
- if os.path.exists(suppression_file):
- cmd.append("--suppressions=%s" % suppression_file)
-    # Platform-specific suppressions
- for platform in common.PlatformNames():
- platform_suppression_file = \
- os.path.join(script_dir, tool_name, 'suppressions_%s.txt' % platform)
- if os.path.exists(platform_suppression_file):
- cmd.append("--suppressions=%s" % platform_suppression_file)
-
- if tool_name == "drmemory":
- if self._options.drmemory_ops:
-        # Prepend " " so that Dr. Memory's options don't confuse optparse.
- cmd += ["--drmemory_ops", " " + self._options.drmemory_ops]
-
- if self._options.valgrind_tool_flags:
- cmd += self._options.valgrind_tool_flags.split(" ")
- if self._options.keep_logs:
- cmd += ["--keep_logs"]
- if valgrind_test_args != None:
- for arg in valgrind_test_args:
- cmd.append(arg)
- if exe:
- self._EnsureBuildDirFound()
- exe_path = os.path.join(self._options.build_dir, exe)
- if not os.path.exists(exe_path):
- raise ExecutableNotFound("Couldn't find '%s'" % exe_path)
-
- # Make sure we don't try to test ASan-built binaries
- # with other dynamic instrumentation-based tools.
- # TODO(timurrrr): also check TSan and MSan?
- # `nm` might not be available, so use try-except.
- try:
- # Do not perform this check on OS X, as 'nm' on 10.6 can't handle
- # binaries built with Clang 3.5+.
- if not common.IsMac():
- nm_output = subprocess.check_output(["nm", exe_path])
- if nm_output.find("__asan_init") != -1:
- raise BadBinary("You're trying to run an executable instrumented "
- "with AddressSanitizer under %s. Please provide "
- "an uninstrumented executable." % tool_name)
- except OSError:
- pass
-
- cmd.append(exe_path)
-      # Valgrind runs tests slowly, so slow tests hurt more; show elapsed time
-      # so we can find the slowpokes.
- cmd.append("--gtest_print_time")
-      # The built-in test launcher for gtest-based executables runs tests in
-      # multiple processes by default. Force single-process mode back on.
- cmd.append("--single-process-tests")
- if self._options.gtest_repeat:
- cmd.append("--gtest_repeat=%s" % self._options.gtest_repeat)
- if self._options.gtest_shuffle:
- cmd.append("--gtest_shuffle")
- if self._options.gtest_break_on_failure:
- cmd.append("--gtest_break_on_failure")
- if self._options.test_launcher_bot_mode:
- cmd.append("--test-launcher-bot-mode")
- if self._options.test_launcher_total_shards is not None:
- cmd.append("--test-launcher-total-shards=%d"
- % self._options.test_launcher_total_shards)
- if self._options.test_launcher_shard_index is not None:
- cmd.append("--test-launcher-shard-index=%d"
- % self._options.test_launcher_shard_index)
- return cmd
-
- def Run(self):
- ''' Runs the test specified by command-line argument --test '''
- logging.info("running test %s" % (self._test))
- return self._test_list[self._test](self)
-
- def _AppendGtestFilter(self, tool, name, cmd):
- '''Append an appropriate --gtest_filter flag to the googletest binary
- invocation.
- If the user passed their own filter mentioning only one test, just use
- it. Otherwise, filter out tests listed in the appropriate gtest_exclude
- files.
- '''
- if (self._gtest_filter and
- ":" not in self._gtest_filter and
- "?" not in self._gtest_filter and
- "*" not in self._gtest_filter):
- cmd.append("--gtest_filter=%s" % self._gtest_filter)
- return
-
- filters = []
- gtest_files_dir = os.path.join(path_utils.ScriptDir(), "gtest_exclude")
-
- gtest_filter_files = [
- os.path.join(gtest_files_dir, name + ".gtest-%s.txt" % tool.ToolName())]
- # Use ".gtest.txt" files only for slow tools, as they now contain
- # Valgrind- and Dr.Memory-specific filters.
- # TODO(glider): rename the files to ".gtest_slow.txt"
- if tool.ToolName() in ChromeTests.SLOW_TOOLS:
- gtest_filter_files += [os.path.join(gtest_files_dir, name + ".gtest.txt")]
- for platform_suffix in common.PlatformNames():
- gtest_filter_files += [
- os.path.join(gtest_files_dir, name + ".gtest_%s.txt" % platform_suffix),
- os.path.join(gtest_files_dir, name + ".gtest-%s_%s.txt" % \
- (tool.ToolName(), platform_suffix))]
- logging.info("Reading gtest exclude filter files:")
- for filename in gtest_filter_files:
- # strip the leading absolute path (may be very long on the bot)
- # and the following / or \.
- readable_filename = filename.replace("\\", "/") # '\' on Windows
- readable_filename = readable_filename.replace(self._source_dir, "")[1:]
- if not os.path.exists(filename):
- logging.info(" \"%s\" - not found" % readable_filename)
- continue
- logging.info(" \"%s\" - OK" % readable_filename)
- f = open(filename, 'r')
- for line in f.readlines():
- if line.startswith("#") or line.startswith("//") or line.isspace():
- continue
- line = line.rstrip()
- test_prefixes = ["FLAKY", "FAILS"]
- for p in test_prefixes:
- # Strip prefixes from the test names.
- line = line.replace(".%s_" % p, ".")
- # Exclude the original test name.
- filters.append(line)
- if line[-2:] != ".*":
- # List all possible prefixes if line doesn't end with ".*".
- for p in test_prefixes:
- filters.append(line.replace(".", ".%s_" % p))
- # Get rid of duplicates.
- filters = set(filters)
- gtest_filter = self._gtest_filter
- if len(filters):
- if gtest_filter:
- gtest_filter += ":"
- if gtest_filter.find("-") < 0:
- gtest_filter += "-"
- else:
- gtest_filter = "-"
- gtest_filter += ":".join(filters)
- if gtest_filter:
- cmd.append("--gtest_filter=%s" % gtest_filter)
-
- @staticmethod
- def ShowTests():
- test_to_names = {}
- for name, test_function in ChromeTests._test_list.iteritems():
- test_to_names.setdefault(test_function, []).append(name)
-
- name_to_aliases = {}
- for names in test_to_names.itervalues():
- names.sort(key=lambda name: len(name))
- name_to_aliases[names[0]] = names[1:]
-
- print
- print "Available tests:"
- print "----------------"
- for name, aliases in sorted(name_to_aliases.iteritems()):
- if aliases:
- print " {} (aka {})".format(name, ', '.join(aliases))
- else:
- print " {}".format(name)
-
- def SetupLdPath(self, requires_build_dir):
- if requires_build_dir:
- self._EnsureBuildDirFound()
- elif not self._options.build_dir:
- return
-
- # Append build_dir to LD_LIBRARY_PATH so external libraries can be loaded.
- if (os.getenv("LD_LIBRARY_PATH")):
- os.putenv("LD_LIBRARY_PATH", "%s:%s" % (os.getenv("LD_LIBRARY_PATH"),
- self._options.build_dir))
- else:
- os.putenv("LD_LIBRARY_PATH", self._options.build_dir)
-
- def SimpleTest(self, module, name, valgrind_test_args=None, cmd_args=None):
- tool = valgrind_test.CreateTool(self._options.valgrind_tool)
- cmd = self._DefaultCommand(tool, name, valgrind_test_args)
- self._AppendGtestFilter(tool, name, cmd)
- cmd.extend(['--test-tiny-timeout=1000'])
- if cmd_args:
- cmd.extend(cmd_args)
-
- self.SetupLdPath(True)
- return tool.Run(cmd, module)
-
- def RunCmdLine(self):
- tool = valgrind_test.CreateTool(self._options.valgrind_tool)
- cmd = self._DefaultCommand(tool, None, self._args)
- self.SetupLdPath(False)
- return tool.Run(cmd, None)
-
- def TestAccessibility(self):
- return self.SimpleTest("accessibility", "accessibility_unittests")
-
- def TestAddressInput(self):
- return self.SimpleTest("addressinput", "libaddressinput_unittests")
-
- def TestAngle(self):
- return self.SimpleTest("angle", "angle_unittests")
-
- def TestAppList(self):
- return self.SimpleTest("app_list", "app_list_unittests")
-
- def TestAsh(self):
- return self.SimpleTest("ash", "ash_unittests")
-
- def TestAura(self):
- return self.SimpleTest("aura", "aura_unittests")
-
- def TestBase(self):
- return self.SimpleTest("base", "base_unittests")
-
- def TestBlinkHeap(self):
- return self.SimpleTest("blink_heap", "blink_heap_unittests")
-
- def TestBlinkPlatform(self):
- return self.SimpleTest("blink_platform", "blink_platform_unittests")
-
- def TestCacheInvalidation(self):
- return self.SimpleTest("cacheinvalidation", "cacheinvalidation_unittests")
-
- def TestCast(self):
- return self.SimpleTest("chrome", "cast_unittests")
-
- def TestCC(self):
- return self.SimpleTest("cc", "cc_unittests",
- cmd_args=[
- "--cc-layer-tree-test-long-timeout"])
-
- def TestChromeApp(self):
- return self.SimpleTest("chrome_app", "chrome_app_unittests")
-
- def TestChromeElf(self):
- return self.SimpleTest("chrome_elf", "chrome_elf_unittests")
-
- def TestChromeDriver(self):
- return self.SimpleTest("chromedriver", "chromedriver_unittests")
-
- def TestChromeOS(self):
- return self.SimpleTest("chromeos", "chromeos_unittests")
-
- def TestComponents(self):
- return self.SimpleTest("components", "components_unittests")
-
- def TestCompositor(self):
- return self.SimpleTest("compositor", "compositor_unittests")
-
- def TestContent(self):
- return self.SimpleTest("content", "content_unittests")
-
- def TestCourgette(self):
- return self.SimpleTest("courgette", "courgette_unittests")
-
- def TestCrypto(self):
- return self.SimpleTest("crypto", "crypto_unittests")
-
- def TestDevice(self):
- return self.SimpleTest("device", "device_unittests")
-
- def TestDisplay(self):
- return self.SimpleTest("display", "display_unittests")
-
- def TestEvents(self):
- return self.SimpleTest("events", "events_unittests")
-
- def TestExtensions(self):
- return self.SimpleTest("extensions", "extensions_unittests")
-
- def TestFFmpegRegressions(self):
- return self.SimpleTest("chrome", "ffmpeg_regression_tests")
-
- def TestGCM(self):
- return self.SimpleTest("gcm", "gcm_unit_tests")
-
- def TestGfx(self):
- return self.SimpleTest("gfx", "gfx_unittests")
-
- def TestGin(self):
- return self.SimpleTest("gin", "gin_unittests")
-
- def TestGoogleApis(self):
- return self.SimpleTest("google_apis", "google_apis_unittests")
-
- def TestGPU(self):
- return self.SimpleTest("gpu", "gpu_unittests")
-
- def TestIpc(self):
- return self.SimpleTest("ipc", "ipc_tests",
- valgrind_test_args=["--trace_children"])
-
- def TestInstallerUtil(self):
- return self.SimpleTest("installer_util", "installer_util_unittests")
-
- def TestInstallStatic(self):
- return self.SimpleTest("install_static", "install_static_unittests")
-
- def TestJingle(self):
- return self.SimpleTest("chrome", "jingle_unittests")
-
- def TestKeyboard(self):
- return self.SimpleTest("keyboard", "keyboard_unittests")
-
- def TestLatency(self):
- return self.SimpleTest("latency", "latency_unittests")
-
- def TestMedia(self):
- return self.SimpleTest("chrome", "media_unittests")
-
- def TestMessageCenter(self):
- return self.SimpleTest("message_center", "message_center_unittests")
-
- def TestMidi(self):
- return self.SimpleTest("chrome", "midi_unittests")
-
- def TestMojoCommon(self):
- return self.SimpleTest("mojo_common", "mojo_common_unittests")
-
- def TestMojoPublicBindings(self):
- return self.SimpleTest("mojo_public_bindings",
- "mojo_public_bindings_unittests")
-
- def TestMojoPublicSystem(self):
- return self.SimpleTest("mojo_public_system",
- "mojo_public_system_unittests")
-
- def TestMojoPublicSysPerf(self):
- return self.SimpleTest("mojo_public_sysperf",
- "mojo_public_system_perftests")
-
- def TestMojoSystem(self):
- return self.SimpleTest("mojo_system", "mojo_system_unittests")
-
- def TestNet(self):
- return self.SimpleTest("net", "net_unittests")
-
- def TestNetPerf(self):
- return self.SimpleTest("net", "net_perftests")
-
- def TestPhoneNumber(self):
- return self.SimpleTest("phonenumber", "libphonenumber_unittests")
-
- def TestPPAPI(self):
- return self.SimpleTest("chrome", "ppapi_unittests")
-
- def TestPrinting(self):
- return self.SimpleTest("chrome", "printing_unittests")
-
- def TestRemoting(self):
- return self.SimpleTest("chrome", "remoting_unittests",
- cmd_args=[
- "--ui-test-action-timeout=60000",
- "--ui-test-action-max-timeout=150000"])
-
- def TestSkia(self):
- return self.SimpleTest("skia", "skia_unittests")
-
- def TestSql(self):
- return self.SimpleTest("chrome", "sql_unittests")
-
- def TestStorage(self):
- return self.SimpleTest("storage", "storage_unittests")
-
- def TestLinuxSandbox(self):
- return self.SimpleTest("sandbox", "sandbox_linux_unittests")
-
- def TestUnit(self):
- # http://crbug.com/51716
- # Disabling all unit tests
- # Problems reappeared after r119922
- if common.IsMac() and (self._options.valgrind_tool == "memcheck"):
- logging.warning("unit_tests are disabled for memcheck on MacOS.")
-      return 0
- return self.SimpleTest("chrome", "unit_tests")
-
- def TestUIBaseUnit(self):
- return self.SimpleTest("chrome", "ui_base_unittests")
-
- def TestUIChromeOS(self):
- return self.SimpleTest("chrome", "ui_chromeos_unittests")
-
- def TestURL(self):
- return self.SimpleTest("chrome", "url_unittests")
-
- def TestViews(self):
- return self.SimpleTest("views", "views_unittests")
-
-
- # Valgrind timeouts are in seconds.
- UI_VALGRIND_ARGS = ["--timeout=14400", "--trace_children", "--indirect"]
- # UI test timeouts are in milliseconds.
- UI_TEST_ARGS = ["--ui-test-action-timeout=60000",
- "--ui-test-action-max-timeout=150000",
- "--no-sandbox"]
-
- # TODO(thestig) fine-tune these values.
- # Valgrind timeouts are in seconds.
- BROWSER_VALGRIND_ARGS = ["--timeout=50000", "--trace_children", "--indirect"]
- # Browser test timeouts are in milliseconds.
- BROWSER_TEST_ARGS = ["--ui-test-action-timeout=400000",
- "--ui-test-action-max-timeout=800000",
- "--no-sandbox"]
-
- def TestBrowser(self):
- return self.SimpleTest("chrome", "browser_tests",
- valgrind_test_args=self.BROWSER_VALGRIND_ARGS,
- cmd_args=self.BROWSER_TEST_ARGS)
-
- def TestContentBrowser(self):
- return self.SimpleTest("content", "content_browsertests",
- valgrind_test_args=self.BROWSER_VALGRIND_ARGS,
- cmd_args=self.BROWSER_TEST_ARGS)
-
- def TestInteractiveUI(self):
- return self.SimpleTest("chrome", "interactive_ui_tests",
- valgrind_test_args=self.UI_VALGRIND_ARGS,
- cmd_args=self.UI_TEST_ARGS)
-
- def TestSyncIntegration(self):
- return self.SimpleTest("chrome", "sync_integration_tests",
- valgrind_test_args=self.UI_VALGRIND_ARGS,
- cmd_args=(["--ui-test-action-max-timeout=450000"]))
-
- def TestLayoutChunk(self, chunk_num, chunk_size):
- # Run tests [chunk_num*chunk_size .. (chunk_num+1)*chunk_size) from the
- # list of tests. Wrap around to beginning of list at end.
- # If chunk_size is zero, run all tests in the list once.
- # If a text file is given as argument, it is used as the list of tests.
- assert((chunk_size == 0) != (len(self._args) == 0))
- # Build the ginormous commandline in 'cmd'.
- # It's going to be roughly
- # python valgrind_test.py ...
- # but we'll use the --indirect flag to valgrind_test.py
- # to avoid valgrinding python.
- # Start by building the valgrind_test.py commandline.
- tool = valgrind_test.CreateTool(self._options.valgrind_tool)
- cmd = self._DefaultCommand(tool)
- cmd.append("--trace_children")
- cmd.append("--indirect_webkit_layout")
- cmd.append("--ignore_exit_code")
- # Now build script_cmd, the run-webkits-tests commandline.
- # Store each chunk in its own directory so that we can find the data later
- chunk_dir = os.path.join("layout", "chunk_%05d" % chunk_num)
- out_dir = os.path.join(path_utils.ScriptDir(), "latest")
- out_dir = os.path.join(out_dir, chunk_dir)
- if os.path.exists(out_dir):
- old_files = glob.glob(os.path.join(out_dir, "*.txt"))
- for f in old_files:
- os.remove(f)
- else:
- os.makedirs(out_dir)
- script = os.path.join(self._source_dir, "third_party", "WebKit", "Tools",
- "Scripts", "run-webkit-tests")
- # http://crbug.com/260627: After the switch to content_shell from DRT, each
- # test now brings up 3 processes. Under Valgrind, they become memory bound
- # and can eventually OOM if we don't reduce the total count.
- # It'd be nice if content_shell automatically throttled the startup of new
- # tests if we're low on memory.
- jobs = max(1, int(multiprocessing.cpu_count() * 0.3))
- script_cmd = ["python", script, "-v",
- # run a separate DumpRenderTree for each test
- "--batch-size=1",
- "--fully-parallel",
- "--child-processes=%d" % jobs,
- "--time-out-ms=800000",
- "--no-retry-failures", # retrying takes too much time
- # http://crbug.com/176908: Don't launch a browser when done.
- "--no-show-results",
- "--nocheck-sys-deps",
- "--additional-driver-flag=--no-sandbox"]
- # Pass build mode to run-webkit-tests. We aren't passed it directly,
- # so parse it out of build_dir. run-webkit-tests can only handle
- # the two values "Release" and "Debug".
- # TODO(Hercules): unify how all our scripts pass around build mode
- # (--mode / --target / --build-dir / --debug)
- if self._options.build_dir:
- build_root, mode = os.path.split(self._options.build_dir)
- script_cmd.extend(["--build-directory", build_root, "--target", mode])
- if (chunk_size > 0):
- script_cmd.append("--run-chunk=%d:%d" % (chunk_num, chunk_size))
- if len(self._args):
- # if the arg is a txt file, then treat it as a list of tests
- if os.path.isfile(self._args[0]) and self._args[0][-4:] == ".txt":
- script_cmd.append("--test-list=%s" % self._args[0])
- else:
- script_cmd.extend(self._args)
- self._AppendGtestFilter(tool, "layout", script_cmd)
- # Now run script_cmd with the wrapper in cmd
- cmd.extend(["--"])
- cmd.extend(script_cmd)
-
-    # Layout tests often fail quickly, but the buildbot remains green.
- # Detect this situation when running with the default chunk size.
- if chunk_size == self.LAYOUT_TESTS_DEFAULT_CHUNK_SIZE:
- min_runtime_in_seconds=120
- else:
- min_runtime_in_seconds=0
- ret = tool.Run(cmd, "layout", min_runtime_in_seconds=min_runtime_in_seconds)
- return ret
-
-
- def TestLayout(self):
- # A "chunk file" is maintained in the local directory so that each test
- # runs a slice of the layout tests of size chunk_size that increments with
- # each run. Since tests can be added and removed from the layout tests at
- # any time, this is not going to give exact coverage, but it will allow us
- # to continuously run small slices of the layout tests under valgrind rather
- # than having to run all of them in one shot.
- chunk_size = self._options.num_tests
- if chunk_size == 0 or len(self._args):
- return self.TestLayoutChunk(0, 0)
- chunk_num = 0
- chunk_file = os.path.join("valgrind_layout_chunk.txt")
- logging.info("Reading state from " + chunk_file)
- try:
- f = open(chunk_file)
- if f:
- chunk_str = f.read()
- if len(chunk_str):
- chunk_num = int(chunk_str)
-          # This should be enough so that we have a couple of complete runs
-          # of test data stored in the archive (although note that when we
-          # loop, we are almost guaranteed not to be at the end of the test list).
- if chunk_num > 10000:
- chunk_num = 0
- f.close()
- except IOError, (errno, strerror):
- logging.error("error reading from file %s (%d, %s)" % (chunk_file,
- errno, strerror))
- # Save the new chunk size before running the tests. Otherwise if a
- # particular chunk hangs the bot, the chunk number will never get
- # incremented and the bot will be wedged.
- logging.info("Saving state to " + chunk_file)
- try:
- f = open(chunk_file, "w")
- chunk_num += 1
- f.write("%d" % chunk_num)
- f.close()
- except IOError, (errno, strerror):
- logging.error("error writing to file %s (%d, %s)" % (chunk_file, errno,
- strerror))
- # Since we're running small chunks of the layout tests, it's important to
- # mark the ones that have errors in them. These won't be visible in the
- # summary list for long, but will be useful for someone reviewing this bot.
- return self.TestLayoutChunk(chunk_num, chunk_size)
-
- # The known list of tests.
- # Recognise the original abbreviations as well as full executable names.
- _test_list = {
- "cmdline" : RunCmdLine,
- "addressinput": TestAddressInput,
- "libaddressinput_unittests": TestAddressInput,
- "accessibility": TestAccessibility,
- "angle": TestAngle, "angle_unittests": TestAngle,
- "app_list": TestAppList, "app_list_unittests": TestAppList,
- "ash": TestAsh, "ash_unittests": TestAsh,
- "aura": TestAura, "aura_unittests": TestAura,
- "base": TestBase, "base_unittests": TestBase,
- "blink_heap": TestBlinkHeap,
- "blink_platform": TestBlinkPlatform,
- "browser": TestBrowser, "browser_tests": TestBrowser,
- "cacheinvalidation": TestCacheInvalidation,
- "cacheinvalidation_unittests": TestCacheInvalidation,
- "cast": TestCast, "cast_unittests": TestCast,
- "cc": TestCC, "cc_unittests": TestCC,
- "chrome_app": TestChromeApp,
- "chrome_elf": TestChromeElf,
- "chromedriver": TestChromeDriver,
- "chromeos": TestChromeOS, "chromeos_unittests": TestChromeOS,
- "components": TestComponents,"components_unittests": TestComponents,
- "compositor": TestCompositor,"compositor_unittests": TestCompositor,
- "content": TestContent, "content_unittests": TestContent,
- "content_browsertests": TestContentBrowser,
- "courgette": TestCourgette, "courgette_unittests": TestCourgette,
- "crypto": TestCrypto, "crypto_unittests": TestCrypto,
- "device": TestDevice, "device_unittests": TestDevice,
- "display": TestDisplay, "display_unittests": TestDisplay,
- "events": TestEvents, "events_unittests": TestEvents,
- "extensions": TestExtensions, "extensions_unittests": TestExtensions,
- "ffmpeg_regression_tests": TestFFmpegRegressions,
- "gcm": TestGCM, "gcm_unit_tests": TestGCM,
- "gin": TestGin, "gin_unittests": TestGin,
- "gfx": TestGfx, "gfx_unittests": TestGfx,
- "google_apis": TestGoogleApis,
- "gpu": TestGPU, "gpu_unittests": TestGPU,
- "ipc": TestIpc, "ipc_tests": TestIpc,
- "installer_util": TestInstallerUtil,
- "installer_util_unittests": TestInstallerUtil,
- "install_static_unittests": TestInstallStatic,
- "interactive_ui": TestInteractiveUI,
- "jingle": TestJingle, "jingle_unittests": TestJingle,
- "keyboard": TestKeyboard, "keyboard_unittests": TestKeyboard,
- "latency": TestLatency, "latency_unittests": TestLatency,
- "layout": TestLayout, "layout_tests": TestLayout,
- "media": TestMedia, "media_unittests": TestMedia,
- "message_center": TestMessageCenter,
- "message_center_unittests" : TestMessageCenter,
- "midi": TestMidi, "midi_unittests": TestMidi,
- "mojo_common": TestMojoCommon,
- "mojo_common_unittests": TestMojoCommon,
- "mojo_system": TestMojoSystem,
- "mojo_system_unittests": TestMojoSystem,
- "mojo_public_system": TestMojoPublicSystem,
- "mojo_public_system_unittests": TestMojoPublicSystem,
- "mojo_public_bindings": TestMojoPublicBindings,
- "mojo_public_bindings_unittests": TestMojoPublicBindings,
- "mojo_public_sysperf": TestMojoPublicSysPerf,
- "net": TestNet, "net_unittests": TestNet,
- "net_perf": TestNetPerf, "net_perftests": TestNetPerf,
- "phonenumber": TestPhoneNumber,
- "libphonenumber_unittests": TestPhoneNumber,
- "ppapi": TestPPAPI, "ppapi_unittests": TestPPAPI,
- "printing": TestPrinting, "printing_unittests": TestPrinting,
- "remoting": TestRemoting, "remoting_unittests": TestRemoting,
- "sandbox": TestLinuxSandbox, "sandbox_linux_unittests": TestLinuxSandbox,
- "skia": TestSkia, "skia_unittests": TestSkia,
- "sql": TestSql, "sql_unittests": TestSql,
- "storage": TestStorage, "storage_unittests": TestStorage,
- "sync_integration_tests": TestSyncIntegration,
- "sync_integration": TestSyncIntegration,
- "ui_base_unit": TestUIBaseUnit, "ui_base_unittests": TestUIBaseUnit,
- "ui_chromeos": TestUIChromeOS, "ui_chromeos_unittests": TestUIChromeOS,
- "unit": TestUnit, "unit_tests": TestUnit,
- "url": TestURL, "url_unittests": TestURL,
- "views": TestViews, "views_unittests": TestViews,
- "webkit": TestLayout,
- }
-
-
-def _main():
- parser = optparse.OptionParser("usage: %prog -b <dir> -t <test> "
- "[-t <test> ...]")
-
- parser.add_option("--help-tests", dest="help_tests", action="store_true",
- default=False, help="List all available tests")
- parser.add_option("-b", "--build-dir",
- help="the location of the compiler output")
- parser.add_option("--target", help="Debug or Release")
- parser.add_option("-t", "--test", action="append", default=[],
- help="which test to run, supports test:gtest_filter format "
- "as well.")
- parser.add_option("--baseline", action="store_true", default=False,
- help="generate baseline data instead of validating")
- parser.add_option("-f", "--force", action="store_true", default=False,
- help="run a broken test anyway")
- parser.add_option("--gtest_filter",
- help="additional arguments to --gtest_filter")
- parser.add_option("--gtest_repeat", help="argument for --gtest_repeat")
- parser.add_option("--gtest_shuffle", action="store_true", default=False,
- help="Randomize tests' orders on every iteration.")
- parser.add_option("--gtest_break_on_failure", action="store_true",
- default=False,
- help="Drop in to debugger on assertion failure. Also "
- "useful for forcing tests to exit with a stack dump "
- "on the first assertion failure when running with "
- "--gtest_repeat=-1")
- parser.add_option("-v", "--verbose", action="store_true", default=False,
- help="verbose output - enable debug log messages")
- parser.add_option("--tool", dest="valgrind_tool", default="memcheck",
- help="specify a valgrind tool to run the tests under")
- parser.add_option("--tool_flags", dest="valgrind_tool_flags", default="",
- help="specify custom flags for the selected valgrind tool")
- parser.add_option("--keep_logs", action="store_true", default=False,
- help="store memory tool logs in the <tool>.logs directory "
- "instead of /tmp.\nThis can be useful for tool "
- "developers/maintainers.\nPlease note that the <tool>"
- ".logs directory will be clobbered on tool startup.")
- parser.add_option("-n", "--num_tests", type="int",
- default=ChromeTests.LAYOUT_TESTS_DEFAULT_CHUNK_SIZE,
- help="for layout tests: # of subtests per run. 0 for all.")
- parser.add_option("--test-launcher-bot-mode", action="store_true",
- help="run the tests with --test-launcher-bot-mode")
- parser.add_option("--test-launcher-total-shards", type=int,
- help="run the tests with --test-launcher-total-shards")
- parser.add_option("--test-launcher-shard-index", type=int,
- help="run the tests with --test-launcher-shard-index")
- parser.add_option("--drmemory_ops",
- help="extra options passed to Dr. Memory")
-
- options, args = parser.parse_args()
-
- # Bake target into build_dir.
- if options.target and options.build_dir:
- assert (options.target !=
- os.path.basename(os.path.dirname(options.build_dir)))
- options.build_dir = os.path.join(os.path.abspath(options.build_dir),
- options.target)
-
- if options.verbose:
- logging_utils.config_root(logging.DEBUG)
- else:
- logging_utils.config_root()
-
- if options.help_tests:
- ChromeTests.ShowTests()
- return 0
-
- if not options.test:
- parser.error("--test not specified")
-
- if len(options.test) != 1 and options.gtest_filter:
- parser.error("--gtest_filter and multiple tests don't make sense together")
-
- BROKEN_TESTS = {
- 'drmemory_light': [
- 'addressinput',
- 'aura',
- 'base_unittests',
- 'cc',
- 'components', # x64 only?
- 'content',
- 'gfx',
- 'mojo_public_bindings',
- ],
- 'drmemory_full': [
- 'addressinput',
- 'aura',
- 'base_unittests',
- 'blink_heap',
- 'blink_platform',
- 'browser_tests',
- 'cast',
- 'cc',
- 'chromedriver',
- 'compositor',
- 'content',
- 'content_browsertests',
- 'device',
- 'events',
- 'extensions',
- 'gfx',
- 'google_apis',
- 'gpu',
- 'ipc_tests',
- 'jingle',
- 'keyboard',
- 'media',
- 'midi',
- 'mojo_common',
- 'mojo_public_bindings',
- 'mojo_public_sysperf',
- 'mojo_public_system',
- 'mojo_system',
- 'net',
- 'remoting',
- 'unit',
- 'url',
- ],
- }
-
- for t in options.test:
-    if t in BROKEN_TESTS.get(options.valgrind_tool, []) and not options.force:
- logging.info("Skipping broken %s test %s -- see crbug.com/633693" %
- (options.valgrind_tool, t))
- return 0
-
- tests = ChromeTests(options, args, t)
- ret = tests.Run()
- if ret: return ret
- return 0
-
-
-if __name__ == "__main__":
- sys.exit(_main())
diff --git a/files/tools_libyuv/valgrind/chrome_tests.sh b/files/tools_libyuv/valgrind/chrome_tests.sh
deleted file mode 100755
index dc17684f..00000000
--- a/files/tools_libyuv/valgrind/chrome_tests.sh
+++ /dev/null
@@ -1,94 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# Set up some paths and re-direct the arguments to chrome_tests.py
-
-export THISDIR=`dirname $0`
-ARGV_COPY="$@"
-
-# We need to set CHROME_VALGRIND iff using Memcheck:
-# tools/valgrind/chrome_tests.sh --tool memcheck
-# or
-# tools/valgrind/chrome_tests.sh --tool=memcheck
-tool="memcheck" # Default to memcheck.
-while (( "$#" ))
-do
- if [[ "$1" == "--tool" ]]
- then
- tool="$2"
- shift
- elif [[ "$1" =~ --tool=(.*) ]]
- then
- tool="${BASH_REMATCH[1]}"
- fi
- shift
-done
-
-NEEDS_VALGRIND=0
-NEEDS_DRMEMORY=0
-
-case "$tool" in
- "memcheck")
- NEEDS_VALGRIND=1
- ;;
- "drmemory" | "drmemory_light" | "drmemory_full" | "drmemory_pattern")
- NEEDS_DRMEMORY=1
- ;;
-esac
-
-if [ "$NEEDS_VALGRIND" == "1" ]
-then
- export CHROME_VALGRIND=`sh $THISDIR/locate_valgrind.sh`
- if [ "$CHROME_VALGRIND" = "" ]
- then
- # locate_valgrind.sh failed
- exit 1
- fi
- echo "Using valgrind binaries from ${CHROME_VALGRIND}"
-
- PATH="${CHROME_VALGRIND}/bin:$PATH"
- # We need to set these variables to override default lib paths hard-coded into
- # Valgrind binary.
- export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind"
- export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind"
-
- # Clean up some /tmp directories that might be stale due to interrupted
- # chrome_tests.py execution.
- # FYI:
- # -mtime +1 <- only print files modified more than 24h ago,
- # -print0/-0 are needed to handle possible newlines in the filenames.
- echo "Cleanup /tmp from Valgrind stuff"
- find /tmp -maxdepth 1 \(\
- -name "vgdb-pipe-*" -or -name "vg_logs_*" -or -name "valgrind.*" \
- \) -mtime +1 -print0 | xargs -0 rm -rf
-fi
-
-if [ "$NEEDS_DRMEMORY" == "1" ]
-then
- if [ -z "$DRMEMORY_COMMAND" ]
- then
- DRMEMORY_PATH="$THISDIR/../../third_party/drmemory"
- DRMEMORY_SFX="$DRMEMORY_PATH/drmemory-windows-sfx.exe"
- if [ ! -f "$DRMEMORY_SFX" ]
- then
- echo "Can't find Dr. Memory executables."
- echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
- echo "for the instructions on how to get them."
- exit 1
- fi
-
- chmod +x "$DRMEMORY_SFX" # Cygwin won't run it without +x.
- "$DRMEMORY_SFX" -o"$DRMEMORY_PATH/unpacked" -y
- export DRMEMORY_COMMAND="$DRMEMORY_PATH/unpacked/bin/drmemory.exe"
- fi
-fi
-
-PYTHONPATH=$THISDIR/../python/google python \
- "$THISDIR/chrome_tests.py" $ARGV_COPY
diff --git a/files/tools_libyuv/valgrind/common.py b/files/tools_libyuv/valgrind/common.py
deleted file mode 100644
index e9ee51e4..00000000
--- a/files/tools_libyuv/valgrind/common.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-import logging
-import platform
-import os
-import signal
-import subprocess
-import sys
-import time
-
-
-class NotImplementedError(Exception):
- pass
-
-
-class TimeoutError(Exception):
- pass
-
-
-def RunSubprocessInBackground(proc):
- """Runs a subprocess in the background. Returns a handle to the process."""
- logging.info("running %s in the background" % " ".join(proc))
- return subprocess.Popen(proc)
-
-
-def RunSubprocess(proc, timeout=0):
- """ Runs a subprocess, until it finishes or |timeout| is exceeded and the
- process is killed with taskkill. A |timeout| <= 0 means no timeout.
-
- Args:
- proc: list of process components (exe + args)
- timeout: how long to wait before killing, <= 0 means wait forever
- """
-
- logging.info("running %s, timeout %d sec" % (" ".join(proc), timeout))
- sys.stdout.flush()
- sys.stderr.flush()
-
- # Manually read and print out stdout and stderr.
- # By default, the subprocess is supposed to inherit these from its parent,
- # however when run under buildbot, it seems unable to read data from a
- # grandchild process, so we have to read the child and print the data as if
- # it came from us for buildbot to read it. We're not sure why this is
- # necessary.
- # TODO(erikkay): should we buffer stderr and stdout separately?
- p = subprocess.Popen(proc, universal_newlines=True,
- bufsize=0, # unbuffered
- stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-
- logging.info("started subprocess")
-
- did_timeout = False
- if timeout > 0:
- wait_until = time.time() + timeout
- while p.poll() is None and not did_timeout:
- # Have to use readline rather than readlines() or "for line in p.stdout:",
- # otherwise we get buffered even with bufsize=0.
- line = p.stdout.readline()
- while line and not did_timeout:
- sys.stdout.write(line)
- sys.stdout.flush()
- line = p.stdout.readline()
- if timeout > 0:
- did_timeout = time.time() > wait_until
-
- if did_timeout:
- logging.info("process timed out")
- else:
- logging.info("process ended, did not time out")
-
- if did_timeout:
- if IsWindows():
- subprocess.call(["taskkill", "/T", "/F", "/PID", str(p.pid)])
- else:
- # Does this kill all children, too?
- os.kill(p.pid, signal.SIGINT)
- logging.error("KILLED %d" % p.pid)
- # Give the process a chance to actually die before continuing
- # so that cleanup can happen safely.
- time.sleep(1.0)
- logging.error("TIMEOUT waiting for %s" % proc[0])
- raise TimeoutError(proc[0])
- else:
- for line in p.stdout:
- sys.stdout.write(line)
- if not IsMac(): # stdout flush fails on Mac
- logging.info("flushing stdout")
- sys.stdout.flush()
-
- logging.info("collecting result code")
- result = p.poll()
- if result:
- logging.error("%s exited with non-zero result code %d" % (proc[0], result))
- return result
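The readline pumping and deadline handling in RunSubprocess can be condensed to the following self-contained sketch (Python 3 here; p.kill() stands in for the original's platform-specific taskkill/SIGINT logic):

    import subprocess
    import sys
    import time

    def run_with_deadline(cmd, timeout=0):
        # Stream child output line by line, enforcing an optional deadline.
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT, universal_newlines=True)
        deadline = time.time() + timeout if timeout > 0 else None
        while p.poll() is None:
            line = p.stdout.readline()  # one line at a time, as above
            if line:
                sys.stdout.write(line)
                sys.stdout.flush()
            if deadline is not None and time.time() > deadline:
                p.kill()  # the original uses taskkill/SIGINT instead
                raise RuntimeError("TIMEOUT waiting for %s" % cmd[0])
        sys.stdout.write(p.stdout.read())  # drain whatever is left
        return p.returncode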
-
-
-def IsLinux():
- return sys.platform.startswith('linux')
-
-
-def IsMac():
- return sys.platform.startswith('darwin')
-
-
-def IsWindows():
- return sys.platform == 'cygwin' or sys.platform.startswith('win')
-
-
-def WindowsVersionName():
- """Returns the name of the Windows version if it is known, or None.
-
- Possible return values are: xp, vista, 7, 8, or None
- """
- if sys.platform == 'cygwin':
- # Windows version number is hiding in system name. Looks like:
- # CYGWIN_NT-6.1-WOW64
- try:
- version_str = platform.uname()[0].split('-')[1]
- except:
- return None
- elif sys.platform.startswith('win'):
- # Normal Windows version string. Mine: 6.1.7601
- version_str = platform.version()
- else:
- return None
-
- parts = version_str.split('.')
- try:
- major = int(parts[0])
- minor = int(parts[1])
- except:
- return None # Can't parse, unknown version.
-
- if major == 5:
- return 'xp'
- elif major == 6 and minor == 0:
- return 'vista'
- elif major == 6 and minor == 1:
- return '7'
- elif major == 6 and minor == 2:
- return '8' # Future proof. ;)
- return None
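For example, the "6.1.7601" version string quoted in the comment parses as follows (sample input assumed):

    version_str = "6.1.7601"  # sample value, as in the comment above
    major, minor = [int(p) for p in version_str.split(".")[:2]]
    print(major, minor)  # 6 1 -> WindowsVersionName() returns '7'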
-
-
-def PlatformNames():
- """Return an array of string to be used in paths for the platform
- (e.g. suppressions, gtest filters, ignore files etc.)
- The first element of the array describes the 'main' platform
- """
- if IsLinux():
- return ['linux']
- if IsMac():
- return ['mac']
- if IsWindows():
- names = ['win32']
- version_name = WindowsVersionName()
- if version_name is not None:
- names.append('win-%s' % version_name)
- return names
- raise NotImplementedError('Unknown platform "%s".' % sys.platform)
-
-
-def PutEnvAndLog(env_name, env_value):
- os.putenv(env_name, env_value)
- logging.info('export %s=%s', env_name, env_value)
-
-def BoringCallers(mangled, use_re_wildcards):
- """Return a list of 'boring' function names (optinally mangled)
- with */? wildcards (optionally .*/.).
- Boring = we drop off the bottom of stack traces below such functions.
- """
-
- need_mangling = [
- # Don't show our testing framework:
- ("testing::Test::Run", "_ZN7testing4Test3RunEv"),
- ("testing::TestInfo::Run", "_ZN7testing8TestInfo3RunEv"),
- ("testing::internal::Handle*ExceptionsInMethodIfSupported*",
- "_ZN7testing8internal3?Handle*ExceptionsInMethodIfSupported*"),
-
- # Depend on scheduling:
- ("MessageLoop::Run", "_ZN11MessageLoop3RunEv"),
- ("MessageLoop::RunTask", "_ZN11MessageLoop7RunTask*"),
- ("RunnableMethod*", "_ZN14RunnableMethod*"),
- ("DispatchToMethod*", "_Z*16DispatchToMethod*"),
- ("base::internal::Invoker*::DoInvoke*",
- "_ZN4base8internal8Invoker*DoInvoke*"), # Invoker{1,2,3}
- ("base::internal::RunnableAdapter*::Run*",
- "_ZN4base8internal15RunnableAdapter*Run*"),
- ]
-
- ret = []
- for pair in need_mangling:
- ret.append(pair[1 if mangled else 0])
-
- ret += [
- # Also don't show the internals of libc/pthread.
- "start_thread",
- "main",
- "BaseThreadInitThunk",
- ]
-
- if use_re_wildcards:
- for i in range(0, len(ret)):
- ret[i] = ret[i].replace('*', '.*').replace('?', '.')
-
- return ret
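A quick sketch of the use_re_wildcards rewrite in action (the mangled frame below is invented for illustration):

    import re

    pattern = "_ZN7testing8internal3?Handle*ExceptionsInMethodIfSupported*"
    regex = pattern.replace('*', '.*').replace('?', '.')
    frame = "_ZN7testing8internal35HandleSehExceptionsInMethodIfSupportedEv"
    print(bool(re.match("^%s$" % regex, frame)))  # True -> frame is 'boring'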
-
-def NormalizeWindowsPath(path):
- """If we're using Cygwin Python, turn the path into a Windows path.
-
-  Forward slashes are deliberately not turned into backslashes, to keep
-  copy-pasting and escaping easy.
-
- TODO(rnk): If we ever want to cut out the subprocess invocation, we can use
- _winreg to get the root Cygwin directory from the registry key:
- HKEY_LOCAL_MACHINE\SOFTWARE\Cygwin\setup\rootdir.
- """
- if sys.platform.startswith("cygwin"):
- p = subprocess.Popen(["cygpath", "-m", path],
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
- (out, err) = p.communicate()
- if err:
- logging.warning("WARNING: cygpath error: %s", err)
- return out.strip()
- else:
- return path
-
-############################
-# Common output format code
-
-def PrintUsedSuppressionsList(suppcounts):
- """ Prints out the list of used suppressions in a format common to all the
- memory tools. If the list is empty, prints nothing and returns False,
- otherwise True.
-
- suppcounts: a dictionary of used suppression counts,
- Key -> name, Value -> count.
- """
- if not suppcounts:
- return False
-
- print "-----------------------------------------------------"
- print "Suppressions used:"
- print " count name"
- for (name, count) in sorted(suppcounts.items(), key=lambda (k,v): (v,k)):
- print "%7d %s" % (count, name)
- print "-----------------------------------------------------"
- sys.stdout.flush()
- return True
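Note the tuple-unpacking lambda above is Python 2 only syntax; a Python 3 compatible equivalent of the same ordering, with sample data assumed, would be:

    suppcounts = {"bug_729": 3, "pango_font_leak_todo": 28}  # sample data
    # Sort by count first, then by name, as in PrintUsedSuppressionsList().
    for name, count in sorted(suppcounts.items(), key=lambda kv: (kv[1], kv[0])):
        print("%7d %s" % (count, name))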
diff --git a/files/tools_libyuv/valgrind/gdb_helper.py b/files/tools_libyuv/valgrind/gdb_helper.py
deleted file mode 100644
index d127f760..00000000
--- a/files/tools_libyuv/valgrind/gdb_helper.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-''' A bunch of helper functions for querying gdb.'''
-
-import logging
-import os
-import re
-import tempfile
-
-GDB_LINE_RE = re.compile(r'Line ([0-9]*) of "([^"]*)".*')
-
-def _GdbOutputToFileLine(output_line):
- ''' Parse the gdb output line, return a pair (file, line num) '''
- match = GDB_LINE_RE.match(output_line)
- if match:
- return match.groups()[1], match.groups()[0]
- else:
- return None
-
-def ResolveAddressesWithinABinary(binary_name, load_address, address_list):
- ''' For each address, return a pair (file, line num) '''
- commands = tempfile.NamedTemporaryFile()
- commands.write('add-symbol-file "%s" %s\n' % (binary_name, load_address))
- for addr in address_list:
- commands.write('info line *%s\n' % addr)
- commands.write('quit\n')
- commands.flush()
- gdb_commandline = 'gdb -batch -x %s 2>/dev/null' % commands.name
- gdb_pipe = os.popen(gdb_commandline)
- result = gdb_pipe.readlines()
-
- address_count = 0
- ret = {}
- for line in result:
- if line.startswith('Line'):
- ret[address_list[address_count]] = _GdbOutputToFileLine(line)
- address_count += 1
- if line.startswith('No line'):
- ret[address_list[address_count]] = (None, None)
- address_count += 1
- gdb_pipe.close()
- commands.close()
- return ret
-
-class AddressTable(object):
- ''' Object to do batched line number lookup. '''
- def __init__(self):
- self._load_addresses = {}
- self._binaries = {}
- self._all_resolved = False
-
- def AddBinaryAt(self, binary, load_address):
- ''' Register a new shared library or executable. '''
- self._load_addresses[binary] = load_address
-
- def Add(self, binary, address):
- ''' Register a lookup request. '''
- if binary == '':
- logging.warn('adding address %s in empty binary?' % address)
- if binary in self._binaries:
- self._binaries[binary].append(address)
- else:
- self._binaries[binary] = [address]
- self._all_resolved = False
-
- def ResolveAll(self):
- ''' Carry out all lookup requests. '''
- self._translation = {}
- for binary in self._binaries.keys():
- if binary != '' and binary in self._load_addresses:
- load_address = self._load_addresses[binary]
- addr = ResolveAddressesWithinABinary(
- binary, load_address, self._binaries[binary])
- self._translation[binary] = addr
- self._all_resolved = True
-
- def GetFileLine(self, binary, addr):
- ''' Get the (filename, linenum) result of a previously-registered lookup
- request.
- '''
- if self._all_resolved:
- if binary in self._translation:
- if addr in self._translation[binary]:
- return self._translation[binary][addr]
- return (None, None)
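A hypothetical end-to-end use of the batched lookup above (binary path and addresses invented; requires gdb on PATH):

    table = AddressTable()
    table.AddBinaryAt("./out/Release/libyuv_unittest", "0x400000")
    table.Add("./out/Release/libyuv_unittest", "0x40123a")
    table.ResolveAll()  # one batched gdb run per registered binary
    print(table.GetFileLine("./out/Release/libyuv_unittest", "0x40123a"))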
diff --git a/files/tools_libyuv/valgrind/libyuv_tests.bat b/files/tools_libyuv/valgrind/libyuv_tests.bat
deleted file mode 100644
index 5fceca67..00000000
--- a/files/tools_libyuv/valgrind/libyuv_tests.bat
+++ /dev/null
@@ -1,79 +0,0 @@
-@echo off
-:: Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
-::
-:: Use of this source code is governed by a BSD-style license
-:: that can be found in the LICENSE file in the root of the source
-:: tree. An additional intellectual property rights grant can be found
-:: in the file PATENTS. All contributing project authors may
-:: be found in the AUTHORS file in the root of the source tree.
-
-:: This script is a copy of chrome_tests.bat with the following changes:
-:: - Invokes libyuv_tests.py instead of chrome_tests.py
-:: - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make
-:: it possible to execute the Python scripts properly.
-
-:: TODO(timurrrr): batch files 'export' all the variables to the parent shell
-set THISDIR=%~dp0
-set TOOL_NAME="unknown"
-
-:: Get the tool name and put it into TOOL_NAME {{{1
-:: NB: SHIFT command doesn't modify %*
-:PARSE_ARGS_LOOP
- if %1 == () GOTO:TOOLNAME_NOT_FOUND
- if %1 == --tool GOTO:TOOLNAME_FOUND
- SHIFT
- goto :PARSE_ARGS_LOOP
-
-:TOOLNAME_NOT_FOUND
-echo "Please specify a tool (tsan or drmemory) by using --tool flag"
-exit /B 1
-
-:TOOLNAME_FOUND
-SHIFT
-set TOOL_NAME=%1
-:: }}}
-if "%TOOL_NAME%" == "drmemory" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_light" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_full" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_pattern" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "tsan" GOTO :SETUP_TSAN
-echo "Unknown tool: `%TOOL_NAME%`! Only tsan and drmemory are supported."
-exit /B 1
-
-:SETUP_DRMEMORY
-if NOT "%DRMEMORY_COMMAND%"=="" GOTO :RUN_TESTS
-:: Set up DRMEMORY_COMMAND to invoke Dr. Memory {{{1
-set DRMEMORY_PATH=%THISDIR%..\..\third_party\drmemory
-set DRMEMORY_SFX=%DRMEMORY_PATH%\drmemory-windows-sfx.exe
-if EXIST %DRMEMORY_SFX% GOTO DRMEMORY_BINARY_OK
-echo "Can't find Dr. Memory executables."
-echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
-echo "for the instructions on how to get them."
-exit /B 1
-
-:DRMEMORY_BINARY_OK
-%DRMEMORY_SFX% -o%DRMEMORY_PATH%\unpacked -y
-set DRMEMORY_COMMAND=%DRMEMORY_PATH%\unpacked\bin\drmemory.exe
-:: }}}
-goto :RUN_TESTS
-
-:SETUP_TSAN
-:: Set up PIN_COMMAND to invoke TSan {{{1
-set TSAN_PATH=%THISDIR%..\..\third_party\tsan
-set TSAN_SFX=%TSAN_PATH%\tsan-x86-windows-sfx.exe
-if EXIST %TSAN_SFX% GOTO TSAN_BINARY_OK
-echo "Can't find ThreadSanitizer executables."
-echo "See http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer/threadsanitizer-on-windows"
-echo "for the instructions on how to get them."
-exit /B 1
-
-:TSAN_BINARY_OK
-%TSAN_SFX% -o%TSAN_PATH%\unpacked -y
-set PIN_COMMAND=%TSAN_PATH%\unpacked\tsan-x86-windows\tsan.bat
-:: }}}
-goto :RUN_TESTS
-
-:RUN_TESTS
-set PYTHONPATH=%THISDIR%..\python\google;%THISDIR%..\valgrind
-set RUNNING_ON_VALGRIND=yes
-python %THISDIR%libyuv_tests.py %*
diff --git a/files/tools_libyuv/valgrind/libyuv_tests.py b/files/tools_libyuv/valgrind/libyuv_tests.py
deleted file mode 100755
index e780bd95..00000000
--- a/files/tools_libyuv/valgrind/libyuv_tests.py
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-"""Runs various libyuv tests through valgrind_test.py.
-
-This script reuses chrome_tests.py from Chrome, but allows running any test
-instead of only the hard-coded ones. It uses the -t cmdline flag to do this, and
-only supports specifying a single test for each run.
-
-Suppression files:
-The Chrome valgrind directory we use as a DEPS dependency contains the following
-suppression files:
- valgrind/memcheck/suppressions.txt
- valgrind/memcheck/suppressions_mac.txt
- valgrind/tsan/suppressions.txt
- valgrind/tsan/suppressions_mac.txt
- valgrind/tsan/suppressions_win32.txt
-Since they're referenced from the chrome_tests.py script, we have similar files
-below the directory of this script. When executing, this script sets up both
-Chrome's suppression files and our own, so we can easily maintain
-libyuv-specific suppressions in our own files.
-"""
-
-import logging
-import optparse
-import os
-import sys
-
-import logging_utils
-import path_utils
-
-import chrome_tests
-
-
-class LibyuvTest(chrome_tests.ChromeTests):
- """Class that handles setup of suppressions for libyuv.
-
- Everything else is inherited from chrome_tests.ChromeTests.
- """
-
- def _DefaultCommand(self, tool, exe=None, valgrind_test_args=None):
- """Override command-building method so we can add more suppressions."""
- cmd = chrome_tests.ChromeTests._DefaultCommand(self, tool, exe,
- valgrind_test_args)
-    # When ChromeTests._DefaultCommand has executed, it has set up suppression
-    # files based on what's found in the memcheck/ or tsan/ subdirectories of
-    # this script's location. When running on Mac or Windows, additional
-    # platform-specific files have also been added.
-    # Since only the files located below this directory are added, we must also
- # add the ones maintained by Chrome, located in ../../tools/valgrind.
-
- # The idea is to look for --suppression arguments in the cmd list and add a
- # modified copy of each suppression file, for the corresponding file in
- # ../../tools/valgrind.
- script_dir = path_utils.ScriptDir()
- old_base, _ = os.path.split(script_dir)
-
- checkout_src = os.path.abspath(os.path.join(script_dir, os.pardir,
- os.pardir))
- new_dir = os.path.join(checkout_src, 'tools', 'valgrind')
- add_suppressions = []
- for token in cmd:
- if '--suppressions' in token:
- add_suppressions.append(token.replace(script_dir, new_dir))
- return add_suppressions + cmd
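The --suppressions duplication described above boils down to a path substitution; a standalone sketch with assumed paths:

    script_dir = "/src/libyuv/tools_libyuv/valgrind"  # assumed layout
    new_dir = "/src/libyuv/tools/valgrind"
    cmd = ["valgrind",
           "--suppressions=%s/memcheck/suppressions.txt" % script_dir]
    add_suppressions = [t.replace(script_dir, new_dir)
                        for t in cmd if "--suppressions" in t]
    print(add_suppressions + cmd)  # Chrome's copy first, then our own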
-
-
-def main(_):
- parser = optparse.OptionParser('usage: %prog -b <dir> -t <test> <test args>')
- parser.disable_interspersed_args()
- parser.add_option('-b', '--build-dir',
- help=('Location of the compiler output. Can only be used '
- 'when the test argument does not contain this path.'))
- parser.add_option("--target", help="Debug or Release")
- parser.add_option('-t', '--test', help='Test to run.')
- parser.add_option('', '--baseline', action='store_true', default=False,
- help='Generate baseline data instead of validating')
- parser.add_option('', '--gtest_filter',
- help='Additional arguments to --gtest_filter')
- parser.add_option('', '--gtest_repeat',
- help='Argument for --gtest_repeat')
- parser.add_option("--gtest_shuffle", action="store_true", default=False,
- help="Randomize tests' orders on every iteration.")
- parser.add_option("--gtest_break_on_failure", action="store_true",
- default=False,
- help="Drop in to debugger on assertion failure. Also "
- "useful for forcing tests to exit with a stack dump "
- "on the first assertion failure when running with "
- "--gtest_repeat=-1")
- parser.add_option('-v', '--verbose', action='store_true', default=False,
- help='Verbose output - enable debug log messages')
- parser.add_option('', '--tool', dest='valgrind_tool', default='memcheck',
- help='Specify a valgrind tool to run the tests under')
- parser.add_option('', '--tool_flags', dest='valgrind_tool_flags', default='',
- help='Specify custom flags for the selected valgrind tool')
- parser.add_option('', '--keep_logs', action='store_true', default=False,
- help=('Store memory tool logs in the <tool>.logs directory '
- 'instead of /tmp.\nThis can be useful for tool '
- 'developers/maintainers.\nPlease note that the <tool>'
- '.logs directory will be clobbered on tool startup.'))
- parser.add_option("--test-launcher-bot-mode", action="store_true",
- help="run the tests with --test-launcher-bot-mode")
- parser.add_option("--test-launcher-total-shards", type=int,
- help="run the tests with --test-launcher-total-shards")
- parser.add_option("--test-launcher-shard-index", type=int,
- help="run the tests with --test-launcher-shard-index")
- options, args = parser.parse_args()
-
- if options.verbose:
- logging_utils.config_root(logging.DEBUG)
- else:
- logging_utils.config_root()
-
- if not options.test:
- parser.error('--test not specified')
-
- # Support build dir both with and without the target.
- if (options.target and options.build_dir and
- not options.build_dir.endswith(options.target)):
- options.build_dir = os.path.join(options.build_dir, options.target)
-
- # If --build_dir is provided, prepend it to the test executable if needed.
- test_executable = options.test
- if options.build_dir and not test_executable.startswith(options.build_dir):
- test_executable = os.path.join(options.build_dir, test_executable)
- args = [test_executable] + args
-
- test = LibyuvTest(options, args, 'cmdline')
- return test.Run()
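For instance, with --build-dir=out --target=Release -t libyuv_unittest (values assumed), the two normalization steps above produce out/Release/libyuv_unittest:

    import os

    build_dir, target, test = "out", "Release", "libyuv_unittest"
    if not build_dir.endswith(target):
        build_dir = os.path.join(build_dir, target)
    if not test.startswith(build_dir):
        test = os.path.join(build_dir, test)
    print(test)  # out/Release/libyuv_unittest (POSIX separators)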
-
-if __name__ == '__main__':
- return_code = main(sys.argv)
- sys.exit(return_code)
diff --git a/files/tools_libyuv/valgrind/libyuv_tests.sh b/files/tools_libyuv/valgrind/libyuv_tests.sh
deleted file mode 100755
index 249032ca..00000000
--- a/files/tools_libyuv/valgrind/libyuv_tests.sh
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# Set up some paths and re-direct the arguments to libyuv_tests.py
-
-# This script is a copy of the chrome_tests.sh wrapper script with the following
-# changes:
-# - The locate_valgrind.sh of Chromium's Valgrind scripts dir is used to locate
-#   the Valgrind framework install. If it fails, a fallback path is used instead
-#   (../../chromium/src/third_party/valgrind/linux_x64) and a warning message
-#   is shown by |show_locate_valgrind_failed_warning|.
-# - libyuv_tests.py is invoked instead of chrome_tests.py.
-# - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make it
-# possible to execute the Python scripts properly.
-
-export THISDIR=`dirname $0`
-ARGV_COPY="$@"
-
-# We need to set CHROME_VALGRIND iff using Memcheck:
-# tools_libyuv/valgrind/libyuv_tests.sh --tool memcheck
-# or
-# tools_libyuv/valgrind/libyuv_tests.sh --tool=memcheck
-tool="memcheck" # Default to memcheck.
-while (( "$#" ))
-do
- if [[ "$1" == "--tool" ]]
- then
- tool="$2"
- shift
- elif [[ "$1" =~ --tool=(.*) ]]
- then
- tool="${BASH_REMATCH[1]}"
- fi
- shift
-done
-
-NEEDS_VALGRIND=0
-
-case "$tool" in
- "memcheck")
- NEEDS_VALGRIND=1
- ;;
-esac
-
-# For libyuv, we'll use the locate_valgrind.sh script in Chromium's Valgrind
-# scripts dir to locate the Valgrind framework install
-CHROME_VALGRIND_SCRIPTS=$THISDIR/../../tools/valgrind
-
-if [ "$NEEDS_VALGRIND" == "1" ]
-then
- CHROME_VALGRIND=`sh $THISDIR/locate_valgrind.sh`
- if [ "$CHROME_VALGRIND" = "" ]
- then
- CHROME_VALGRIND=../../src/third_party/valgrind/linux_x64
- echo
- echo "-------------------- WARNING ------------------------"
- echo "locate_valgrind.sh failed."
- echo "Using $CHROME_VALGRIND as a fallback location."
- echo "This might be because:"
- echo "1) This is a swarming bot"
- echo "2) You haven't set up the valgrind binaries correctly."
- echo "In this case, please make sure you have followed the instructions at"
- echo "http://www.chromium.org/developers/how-tos/using-valgrind/get-valgrind"
- echo "Notice: In the .gclient file, you need to add this for the 'libyuv'"
- echo "solution since our directory structure is different from Chromium's:"
- echo "\"custom_deps\": {"
- echo " \"libyuv/third_party/valgrind\":"
- echo " \"https://chromium.googlesource.com/chromium/deps/valgrind/binaries\","
- echo "},"
- echo "-----------------------------------------------------"
- echo
- fi
- echo "Using valgrind binaries from ${CHROME_VALGRIND}"
-
- PATH="${CHROME_VALGRIND}/bin:$PATH"
- # We need to set these variables to override default lib paths hard-coded into
- # Valgrind binary.
- export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind"
- export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind"
-
- # Clean up some /tmp directories that might be stale due to interrupted
- # chrome_tests.py execution.
- # FYI:
- # -mtime +1 <- only print files modified more than 24h ago,
- # -print0/-0 are needed to handle possible newlines in the filenames.
- echo "Cleanup /tmp from Valgrind stuff"
- find /tmp -maxdepth 1 \(\
- -name "vgdb-pipe-*" -or -name "vg_logs_*" -or -name "valgrind.*" \
- \) -mtime +1 -print0 | xargs -0 rm -rf
-fi
-
-# Add Chrome's Valgrind scripts dir to the PYTHON_PATH since it contains
-# the scripts that are needed for this script to run
-PYTHONPATH=$THISDIR/../../tools/python/google:$CHROME_VALGRIND_SCRIPTS python \
- "$THISDIR/libyuv_tests.py" $ARGV_COPY
diff --git a/files/tools_libyuv/valgrind/locate_valgrind.sh b/files/tools_libyuv/valgrind/locate_valgrind.sh
deleted file mode 100755
index d9594f48..00000000
--- a/files/tools_libyuv/valgrind/locate_valgrind.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# Prints a path to Valgrind binaries to be used for Chromium.
-# Select the valgrind from third_party/valgrind by default,
-# but allow users to override this default without editing scripts and
-# without specifying a command-line option.
-
-export THISDIR=`dirname $0`
-
-# Users may supply their own Valgrind by setting the CHROME_VALGRIND env var.
-if [ "$CHROME_VALGRIND" = "" ]
-then
- # Guess which binaries we should use by uname
- case "$(uname -a)" in
- *Linux*x86_64*)
- PLATFORM="linux_x64"
- ;;
- *Linux*86*)
- PLATFORM="linux_x86"
- ;;
- *Darwin*9.[678].[01]*i386*)
- # Didn't test other kernels.
- PLATFORM="mac"
- ;;
- *Darwin*10.[0-9].[0-9]*i386*)
- PLATFORM="mac_10.6"
- ;;
- *Darwin*10.[0-9].[0-9]*x86_64*)
- PLATFORM="mac_10.6"
- ;;
- *Darwin*11.[0-9].[0-9]*x86_64*)
- PLATFORM="mac_10.7"
- ;;
- *)
- (echo "Sorry, your platform is not supported:" &&
- uname -a
- echo
- echo "If you're on Mac OS X, please see http://crbug.com/441425") >&2
- exit 42
- esac
-
- # The binaries should be in third_party/valgrind
- # (checked out from deps/third_party/valgrind/binaries).
- CHROME_VALGRIND="$THISDIR/../../third_party/valgrind/$PLATFORM"
-
- # TODO(timurrrr): readlink -f is not present on Mac...
- if [ "$PLATFORM" != "mac" ] && \
- [ "$PLATFORM" != "mac_10.6" ] && \
- [ "$PLATFORM" != "mac_10.7" ]
- then
- # Get rid of all "../" dirs
- CHROME_VALGRIND=$(readlink -f $CHROME_VALGRIND)
- fi
-fi
-
-if ! test -x $CHROME_VALGRIND/bin/valgrind
-then
- echo "Oops, could not find Valgrind binaries in your checkout." >&2
- echo "Please see" >&2
- echo " http://dev.chromium.org/developers/how-tos/using-valgrind/get-valgrind" >&2
- echo "for the instructions on how to download pre-built binaries." >&2
- exit 1
-fi
-
-echo $CHROME_VALGRIND
diff --git a/files/tools_libyuv/valgrind/memcheck/OWNERS b/files/tools_libyuv/valgrind/memcheck/OWNERS
deleted file mode 100644
index 72e8ffc0..00000000
--- a/files/tools_libyuv/valgrind/memcheck/OWNERS
+++ /dev/null
@@ -1 +0,0 @@
-*
diff --git a/files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py b/files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py
deleted file mode 100644
index 03329214..00000000
--- a/files/tools_libyuv/valgrind/memcheck/PRESUBMIT.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-"""
-Copied from Chrome's src/tools/valgrind/memcheck/PRESUBMIT.py
-
-See http://dev.chromium.org/developers/how-tos/depottools/presubmit-scripts
-for more details on the presubmit API built into gcl.
-"""
-
-import os
-import re
-import sys
-
-def CheckChange(input_api, output_api):
- """Checks the memcheck suppressions files for bad data."""
-
- # Add the path to the Chrome valgrind dir to the import path:
- tools_vg_path = os.path.join(input_api.PresubmitLocalPath(), '..', '..', '..',
- 'tools', 'valgrind')
- sys.path.append(tools_vg_path)
- import suppressions
-
- sup_regex = re.compile('suppressions.*\.txt$')
- suppressions = {}
- errors = []
- check_for_memcheck = False
- # skip_next_line has 3 possible values:
- # - False: don't skip the next line.
- # - 'skip_suppression_name': the next line is a suppression name, skip.
- # - 'skip_param': the next line is a system call parameter error, skip.
- skip_next_line = False
- for f in filter(lambda x: sup_regex.search(x.LocalPath()),
- input_api.AffectedFiles()):
- for line, line_num in zip(f.NewContents(),
- xrange(1, len(f.NewContents()) + 1)):
- line = line.lstrip()
- if line.startswith('#') or not line:
- continue
-
- if skip_next_line:
- if skip_next_line == 'skip_suppression_name':
- if 'insert_a_suppression_name_here' in line:
- errors.append('"insert_a_suppression_name_here" is not a valid '
- 'suppression name')
- if suppressions.has_key(line):
-          if f.LocalPath() == suppressions[line][0].LocalPath():
- errors.append('suppression with name "%s" at %s line %s '
- 'has already been defined at line %s' %
- (line, f.LocalPath(), line_num,
- suppressions[line][1]))
- else:
- errors.append('suppression with name "%s" at %s line %s '
- 'has already been defined at %s line %s' %
- (line, f.LocalPath(), line_num,
-                           suppressions[line][0].LocalPath(),
-                           suppressions[line][1]))
- else:
- suppressions[line] = (f, line_num)
-          check_for_memcheck = True
- skip_next_line = False
- continue
- if check_for_memcheck:
- if not line.startswith('Memcheck:'):
- errors.append('"%s" should be "Memcheck:..." in %s line %s' %
- (line, f.LocalPath(), line_num))
-          check_for_memcheck = False
- if line == '{':
- skip_next_line = 'skip_suppression_name'
- continue
- if line == "Memcheck:Param":
- skip_next_line = 'skip_param'
- continue
-
- if (line.startswith('fun:') or line.startswith('obj:') or
- line.startswith('Memcheck:') or line == '}' or
- line == '...'):
- continue
- errors.append('"%s" is probably wrong: %s line %s' % (line, f.LocalPath(),
- line_num))
- if errors:
- return [output_api.PresubmitError('\n'.join(errors))]
- return []
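For reference, a record that passes the checks above looks like this (the name and frames are invented for illustration):

    {
      bug_12345_example_name
      Memcheck:Uninitialized
      fun:_ZN6libyuv*
      ...
      obj:*libyuv_unittest
    }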
-
-def CheckChangeOnUpload(input_api, output_api):
- return CheckChange(input_api, output_api)
-
-def CheckChangeOnCommit(input_api, output_api):
- return CheckChange(input_api, output_api)
-
-def GetPreferredTrySlaves():
- # We don't have any memcheck slaves yet, so there's no use for this method.
- # When we have, the slave name(s) should be put into this list.
- return []
diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions.txt b/files/tools_libyuv/valgrind/memcheck/suppressions.txt
deleted file mode 100644
index 3f0f6d44..00000000
--- a/files/tools_libyuv/valgrind/memcheck/suppressions.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-# This file is used in addition to the one already maintained in Chrome.
-# It acts as a place holder for future additions for this project.
-# It must exist for the Python wrapper script to work properly.
-
-# There are three kinds of suppressions in this file.
-# 1. third_party libraries
-# 2. libyuv stuff
-# 3. libjingle stuff (talk folder)
-#-----------------------------------------------------------------------
-
-# third_party libraries
-{
- bug_729
- Memcheck:Free
- fun:_ZdaPv
- ...
- fun:_ZN7testing8internal12UnitTestImplD1Ev
- ...
-}
-
-# libyuv (empty so far)
diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt b/files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt
deleted file mode 100644
index 3ad0c8cc..00000000
--- a/files/tools_libyuv/valgrind/memcheck/suppressions_mac.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-# This file is used in addition to the one already maintained in Chrome.
-# It acts as a place holder for future additions for this project.
-# It must exist for the Python wrapper script to work properly.
-
-
diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt b/files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt
deleted file mode 100644
index 3ad0c8cc..00000000
--- a/files/tools_libyuv/valgrind/memcheck/suppressions_win32.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-# This file is used in addition to the one already maintained in Chrome.
-# It acts as a place holder for future additions for this project.
-# It must exist for the Python wrapper script to work properly.
-
-
diff --git a/files/tools_libyuv/valgrind/memcheck_analyze.py b/files/tools_libyuv/valgrind/memcheck_analyze.py
deleted file mode 100755
index 80e85eb4..00000000
--- a/files/tools_libyuv/valgrind/memcheck_analyze.py
+++ /dev/null
@@ -1,644 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# memcheck_analyze.py
-
-''' Given a valgrind XML file, parses errors and de-duplicates them.'''
-
-import gdb_helper
-
-from collections import defaultdict
-import hashlib
-import logging
-import optparse
-import os
-import re
-import subprocess
-import sys
-import time
-from xml.dom.minidom import parse
-from xml.parsers.expat import ExpatError
-
-import common
-
-# Global symbol table (yuck)
-TheAddressTable = None
-
-# These are regexps that define functions (using C++ mangled names)
-# we don't want to see in stack traces while pretty printing
-# or generating suppressions.
-# Just stop printing the stack/suppression frames when the current one
-# matches any of these.
-_BORING_CALLERS = common.BoringCallers(mangled=True, use_re_wildcards=True)
-
-def getTextOf(top_node, name):
- ''' Returns all text in all DOM nodes with a certain |name| that are children
- of |top_node|.
- '''
-
- text = ""
- for nodes_named in top_node.getElementsByTagName(name):
- text += "".join([node.data for node in nodes_named.childNodes
- if node.nodeType == node.TEXT_NODE])
- return text
-
-def getCDATAOf(top_node, name):
- ''' Returns all CDATA in all DOM nodes with a certain |name| that are children
- of |top_node|.
- '''
-
- text = ""
- for nodes_named in top_node.getElementsByTagName(name):
- text += "".join([node.data for node in nodes_named.childNodes
- if node.nodeType == node.CDATA_SECTION_NODE])
- if (text == ""):
- return None
- return text
-
-def shortenFilePath(source_dir, directory):
- '''Returns a string with the string prefix |source_dir| removed from
- |directory|.'''
- prefixes_to_cut = ["build/src/", "valgrind/coregrind/", "out/Release/../../"]
-
- if source_dir:
- prefixes_to_cut.append(source_dir)
-
- for p in prefixes_to_cut:
- index = directory.rfind(p)
- if index != -1:
- directory = directory[index + len(p):]
-
- return directory
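For example (path assumed), the buildbot prefix "build/src/" is cut wherever it last occurs:

    # Using shortenFilePath() above on an assumed buildbot path:
    print(shortenFilePath(None, "/b/build/src/libyuv/source/row_common.cc"))
    # -> libyuv/source/row_common.cc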
-
-# Constants that give real names to the abbreviations in valgrind XML output.
-INSTRUCTION_POINTER = "ip"
-OBJECT_FILE = "obj"
-FUNCTION_NAME = "fn"
-SRC_FILE_DIR = "dir"
-SRC_FILE_NAME = "file"
-SRC_LINE = "line"
-
-def gatherFrames(node, source_dir):
- frames = []
- for frame in node.getElementsByTagName("frame"):
- frame_dict = {
- INSTRUCTION_POINTER : getTextOf(frame, INSTRUCTION_POINTER),
- OBJECT_FILE : getTextOf(frame, OBJECT_FILE),
- FUNCTION_NAME : getTextOf(frame, FUNCTION_NAME),
- SRC_FILE_DIR : shortenFilePath(
- source_dir, getTextOf(frame, SRC_FILE_DIR)),
- SRC_FILE_NAME : getTextOf(frame, SRC_FILE_NAME),
- SRC_LINE : getTextOf(frame, SRC_LINE)
- }
-
- # Ignore this frame and all the following if it's a "boring" function.
- enough_frames = False
- for regexp in _BORING_CALLERS:
- if re.match("^%s$" % regexp, frame_dict[FUNCTION_NAME]):
- enough_frames = True
- break
- if enough_frames:
- break
-
- frames += [frame_dict]
-
- global TheAddressTable
- if TheAddressTable != None and frame_dict[SRC_LINE] == "":
- # Try using gdb
- TheAddressTable.Add(frame_dict[OBJECT_FILE],
- frame_dict[INSTRUCTION_POINTER])
- return frames
-
-class ValgrindError:
- ''' Takes a <DOM Element: error> node and reads all the data from it. A
- ValgrindError is immutable and is hashed on its pretty printed output.
- '''
-
- def __init__(self, source_dir, error_node, commandline, testcase):
- ''' Copies all the relevant information out of the DOM and into object
- properties.
-
- Args:
-      source_dir: Prefix that should be stripped from the <dir> node.
-      error_node: The <error></error> DOM node we're extracting from.
-      commandline: The command that was run under valgrind.
- testcase: The test case name, if known.
- '''
-
- # Valgrind errors contain one <what><stack> pair, plus an optional
- # <auxwhat><stack> pair, plus an optional <origin><what><stack></origin>,
- # plus (since 3.5.0) a <suppression></suppression> pair.
- # (Origin is nicely enclosed; too bad the other two aren't.)
- # The most common way to see all three in one report is
- # a syscall with a parameter that points to uninitialized memory, e.g.
- # Format:
- # <error>
- # <unique>0x6d</unique>
- # <tid>1</tid>
- # <kind>SyscallParam</kind>
- # <what>Syscall param write(buf) points to uninitialised byte(s)</what>
- # <stack>
- # <frame>
- # ...
- # </frame>
- # </stack>
- # <auxwhat>Address 0x5c9af4f is 7 bytes inside a block of ...</auxwhat>
- # <stack>
- # <frame>
- # ...
- # </frame>
- # </stack>
- # <origin>
- # <what>Uninitialised value was created by a heap allocation</what>
- # <stack>
- # <frame>
- # ...
- # </frame>
- # </stack>
- # </origin>
- # <suppression>
- # <sname>insert_a_suppression_name_here</sname>
- # <skind>Memcheck:Param</skind>
- # <skaux>write(buf)</skaux>
- # <sframe> <fun>__write_nocancel</fun> </sframe>
- # ...
- # <sframe> <fun>main</fun> </sframe>
- # <rawtext>
- # <![CDATA[
- # {
- # <insert_a_suppression_name_here>
- # Memcheck:Param
- # write(buf)
- # fun:__write_nocancel
- # ...
- # fun:main
- # }
- # ]]>
- # </rawtext>
- # </suppression>
- # </error>
- #
- # Each frame looks like this:
- # <frame>
- # <ip>0x83751BC</ip>
- # <obj>/data/dkegel/chrome-build/src/out/Release/base_unittests</obj>
- # <fn>_ZN7testing8internal12TestInfoImpl7RunTestEPNS_8TestInfoE</fn>
- # <dir>/data/dkegel/chrome-build/src/testing/gtest/src</dir>
- # <file>gtest-internal-inl.h</file>
- # <line>655</line>
- # </frame>
- # although the dir, file, and line elements are missing if there is
- # no debug info.
-
- self._kind = getTextOf(error_node, "kind")
- self._backtraces = []
- self._suppression = None
- self._commandline = commandline
- self._testcase = testcase
- self._additional = []
-
- # Iterate through the nodes, parsing <what|auxwhat><stack> pairs.
- description = None
- for node in error_node.childNodes:
- if node.localName == "what" or node.localName == "auxwhat":
- description = "".join([n.data for n in node.childNodes
- if n.nodeType == n.TEXT_NODE])
- elif node.localName == "xwhat":
- description = getTextOf(node, "text")
- elif node.localName == "stack":
- assert description
- self._backtraces.append([description, gatherFrames(node, source_dir)])
- description = None
- elif node.localName == "origin":
- description = getTextOf(node, "what")
- stack = node.getElementsByTagName("stack")[0]
- frames = gatherFrames(stack, source_dir)
- self._backtraces.append([description, frames])
- description = None
- stack = None
- frames = None
-      elif description and node.localName is not None:
-        # The latest description has no stack, e.g. "Address 0x28 is unknown"
- self._additional.append(description)
- description = None
-
- if node.localName == "suppression":
-          self._suppression = getCDATAOf(node, "rawtext")
-
- def __str__(self):
- ''' Pretty print the type and backtrace(s) of this specific error,
- including suppression (which is just a mangled backtrace).'''
- output = ""
- output += "\n" # Make sure the ### is at the beginning of line.
- output += "### BEGIN MEMORY TOOL REPORT (error hash=#%016X#)\n" % \
- self.ErrorHash()
- if (self._commandline):
- output += self._commandline + "\n"
-
- output += self._kind + "\n"
- for backtrace in self._backtraces:
- output += backtrace[0] + "\n"
- filter = subprocess.Popen("c++filt -n", stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- shell=True,
- close_fds=True)
- buf = ""
- for frame in backtrace[1]:
- buf += (frame[FUNCTION_NAME] or frame[INSTRUCTION_POINTER]) + "\n"
- (stdoutbuf, stderrbuf) = filter.communicate(buf.encode('latin-1'))
- demangled_names = stdoutbuf.split("\n")
-
- i = 0
- for frame in backtrace[1]:
- output += (" " + demangled_names[i])
- i = i + 1
-
- global TheAddressTable
- if TheAddressTable != None and frame[SRC_FILE_DIR] == "":
- # Try using gdb
- foo = TheAddressTable.GetFileLine(frame[OBJECT_FILE],
- frame[INSTRUCTION_POINTER])
- if foo[0] != None:
- output += (" (" + foo[0] + ":" + foo[1] + ")")
- elif frame[SRC_FILE_DIR] != "":
- output += (" (" + frame[SRC_FILE_DIR] + "/" + frame[SRC_FILE_NAME] +
- ":" + frame[SRC_LINE] + ")")
- else:
- output += " (" + frame[OBJECT_FILE] + ")"
- output += "\n"
-
- for additional in self._additional:
- output += additional + "\n"
-
- assert self._suppression != None, "Your Valgrind doesn't generate " \
- "suppressions - is it too old?"
-
- if self._testcase:
- output += "The report came from the `%s` test.\n" % self._testcase
- output += "Suppression (error hash=#%016X#):\n" % self.ErrorHash()
- output += (" For more info on using suppressions see "
- "http://dev.chromium.org/developers/tree-sheriffs/sheriff-details-chromium/memory-sheriff#TOC-Suppressing-memory-reports")
-
- # Widen suppression slightly to make portable between mac and linux
- # TODO(timurrrr): Oops, these transformations should happen
- # BEFORE calculating the hash!
-    supp = self._suppression
- supp = supp.replace("fun:_Znwj", "fun:_Znw*")
- supp = supp.replace("fun:_Znwm", "fun:_Znw*")
- supp = supp.replace("fun:_Znaj", "fun:_Zna*")
- supp = supp.replace("fun:_Znam", "fun:_Zna*")
-
- # Make suppressions even less platform-dependent.
- for sz in [1, 2, 4, 8]:
- supp = supp.replace("Memcheck:Addr%d" % sz, "Memcheck:Unaddressable")
- supp = supp.replace("Memcheck:Value%d" % sz, "Memcheck:Uninitialized")
- supp = supp.replace("Memcheck:Cond", "Memcheck:Uninitialized")
-
- # Split into lines so we can enforce length limits
- supplines = supp.split("\n")
- supp = None # to avoid re-use
-
- # Truncate at line 26 (VG_MAX_SUPP_CALLERS plus 2 for name and type)
- # or at the first 'boring' caller.
- # (https://bugs.kde.org/show_bug.cgi?id=199468 proposes raising
- # VG_MAX_SUPP_CALLERS, but we're probably fine with it as is.)
-    newlen = min(26, len(supplines))
-
- # Drop boring frames and all the following.
- enough_frames = False
- for frameno in range(newlen):
- for boring_caller in _BORING_CALLERS:
- if re.match("^ +fun:%s$" % boring_caller, supplines[frameno]):
- newlen = frameno
- enough_frames = True
- break
- if enough_frames:
- break
- if (len(supplines) > newlen):
- supplines = supplines[0:newlen]
- supplines.append("}")
-
- for frame in range(len(supplines)):
- # Replace the always-changing anonymous namespace prefix with "*".
- m = re.match("( +fun:)_ZN.*_GLOBAL__N_.*\.cc_" +
- "[0-9a-fA-F]{8}_[0-9a-fA-F]{8}(.*)",
- supplines[frame])
- if m:
- supplines[frame] = "*".join(m.groups())
-
- output += "\n".join(supplines) + "\n"
- output += "### END MEMORY TOOL REPORT (error hash=#%016X#)\n" % \
- self.ErrorHash()
-
- return output
-
- def UniqueString(self):
- ''' String to use for object identity. Don't print this, use str(obj)
- instead.'''
- rep = self._kind + " "
- for backtrace in self._backtraces:
- for frame in backtrace[1]:
- rep += frame[FUNCTION_NAME]
-
- if frame[SRC_FILE_DIR] != "":
- rep += frame[SRC_FILE_DIR] + "/" + frame[SRC_FILE_NAME]
- else:
- rep += frame[OBJECT_FILE]
-
- return rep
-
- # This is a device-independent hash identifying the suppression.
- # By printing out this hash we can find duplicate reports between tests and
- # different shards running on multiple buildbots
- def ErrorHash(self):
- return int(hashlib.md5(self.UniqueString()).hexdigest()[:16], 16)
-
- def __hash__(self):
- return hash(self.UniqueString())
- def __eq__(self, rhs):
- return self.UniqueString() == rhs
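The platform-widening rewrites applied in __str__ above are plain string substitutions; a standalone sketch on a made-up suppression:

    supp = "{\n  <name>\n  Memcheck:Addr4\n  fun:_Znwj\n  fun:main\n}"
    for old, new in [("fun:_Znwj", "fun:_Znw*"),
                     ("fun:_Znwm", "fun:_Znw*"),
                     ("Memcheck:Addr4", "Memcheck:Unaddressable"),
                     ("Memcheck:Cond", "Memcheck:Uninitialized")]:
        supp = supp.replace(old, new)
    print(supp)  # operator new and the access width are now wildcarded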
-
-def log_is_finished(f, force_finish):
- f.seek(0)
- prev_line = ""
- while True:
- line = f.readline()
- if line == "":
- if not force_finish:
- return False
- # Okay, the log is not finished but we can make it up to be parseable:
- if prev_line.strip() in ["</error>", "</errorcounts>", "</status>"]:
- f.write("</valgrindoutput>\n")
- return True
- return False
- if '</valgrindoutput>' in line:
- # Valgrind often has garbage after </valgrindoutput> upon crash.
- f.truncate()
- return True
- prev_line = line
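A hypothetical repair pass using the helper above, for a log whose writer died mid-stream (the filename is assumed):

    with open("valgrind.12345.xml", "r+") as f:  # assumed log name
        if log_is_finished(f, True):  # True forces closing tags to be added
            print("log is (or has been made) parseable")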
-
-class MemcheckAnalyzer:
-  ''' Given a set of Valgrind XML files, parse all the errors out of them,
-  de-duplicate them and output the results.'''
-
- SANITY_TEST_SUPPRESSIONS = {
- "Memcheck sanity test 01 (memory leak).": 1,
- "Memcheck sanity test 02 (malloc/read left).": 1,
- "Memcheck sanity test 03 (malloc/read right).": 1,
- "Memcheck sanity test 04 (malloc/write left).": 1,
- "Memcheck sanity test 05 (malloc/write right).": 1,
- "Memcheck sanity test 06 (new/read left).": 1,
- "Memcheck sanity test 07 (new/read right).": 1,
- "Memcheck sanity test 08 (new/write left).": 1,
- "Memcheck sanity test 09 (new/write right).": 1,
- "Memcheck sanity test 10 (write after free).": 1,
- "Memcheck sanity test 11 (write after delete).": 1,
- "Memcheck sanity test 12 (array deleted without []).": 1,
- "Memcheck sanity test 13 (single element deleted with []).": 1,
- "Memcheck sanity test 14 (malloc/read uninit).": 1,
- "Memcheck sanity test 15 (new/read uninit).": 1,
- }
-
- # Max time to wait for memcheck logs to complete.
- LOG_COMPLETION_TIMEOUT = 180.0
-
- def __init__(self, source_dir, show_all_leaks=False, use_gdb=False):
- '''Create a parser for Memcheck logs.
-
- Args:
- source_dir: Path to top of source tree for this build
- show_all_leaks: Whether to show even less important leaks
- use_gdb: Whether to use gdb to resolve source filenames and line numbers
- in the report stacktraces
- '''
- self._source_dir = source_dir
- self._show_all_leaks = show_all_leaks
- self._use_gdb = use_gdb
-
- # Contains the set of unique errors
- self._errors = set()
-
-    # Contains the time when we started analyzing the first log file.
- # This variable is used to skip incomplete logs after some timeout.
- self._analyze_start_time = None
-
-
- def Report(self, files, testcase, check_sanity=False):
- '''Reads in a set of files and prints Memcheck report.
-
- Args:
-      files: A list of filenames.
-      testcase: The test case name, if known.
-      check_sanity: If true, search for SANITY_TEST_SUPPRESSIONS.
- '''
- # Beyond the detailed errors parsed by ValgrindError above,
-    # the XML files contain records describing suppressions that were used:
- # <suppcounts>
- # <pair>
- # <count>28</count>
- # <name>pango_font_leak_todo</name>
- # </pair>
- # <pair>
- # <count>378</count>
- # <name>bug_13243</name>
- # </pair>
-    # </suppcounts>
- # Collect these and print them at the end.
- #
- # With our patch for https://bugs.kde.org/show_bug.cgi?id=205000 in,
- # the file also includes records of the form
- # <load_obj><obj>/usr/lib/libgcc_s.1.dylib</obj><ip>0x27000</ip></load_obj>
- # giving the filename and load address of each binary that was mapped
- # into the process.
-
- global TheAddressTable
- if self._use_gdb:
- TheAddressTable = gdb_helper.AddressTable()
- else:
- TheAddressTable = None
- cur_report_errors = set()
- suppcounts = defaultdict(int)
- badfiles = set()
-
- if self._analyze_start_time == None:
- self._analyze_start_time = time.time()
- start_time = self._analyze_start_time
-
- parse_failed = False
- for file in files:
- # Wait up to three minutes for valgrind to finish writing all files,
- # but after that, just skip incomplete files and warn.
- f = open(file, "r+")
- pid = re.match(".*\.([0-9]+)$", file)
- if pid:
- pid = pid.groups()[0]
- found = False
- running = True
- firstrun = True
- skip = False
- origsize = os.path.getsize(file)
- while (running and not found and not skip and
- (firstrun or
- ((time.time() - start_time) < self.LOG_COMPLETION_TIMEOUT))):
- firstrun = False
- f.seek(0)
- if pid:
- # Make sure the process is still running so we don't wait for
- # 3 minutes if it was killed. See http://crbug.com/17453
- ps_out = subprocess.Popen("ps p %s" % pid, shell=True,
- stdout=subprocess.PIPE).stdout
- if len(ps_out.readlines()) < 2:
- running = False
- else:
- skip = True
- running = False
- found = log_is_finished(f, False)
- if not running and not found:
- logging.warn("Valgrind process PID = %s is not running but its "
- "XML log has not been finished correctly.\n"
- "Make it up by adding some closing tags manually." % pid)
- found = log_is_finished(f, not running)
- if running and not found:
- time.sleep(1)
- f.close()
- if not found:
- badfiles.add(file)
- else:
- newsize = os.path.getsize(file)
- if origsize > newsize+1:
- logging.warn(str(origsize - newsize) +
- " bytes of junk were after </valgrindoutput> in %s!" %
- file)
- try:
-        parsed_file = parse(file)
- except ExpatError, e:
- parse_failed = True
- logging.warn("could not parse %s: %s" % (file, e))
- lineno = e.lineno - 1
- context_lines = 5
- context_start = max(0, lineno - context_lines)
- context_end = lineno + context_lines + 1
- context_file = open(file, "r")
- for i in range(0, context_start):
- context_file.readline()
- for i in range(context_start, context_end):
- context_data = context_file.readline().rstrip()
- if i != lineno:
- logging.warn(" %s" % context_data)
- else:
- logging.warn("> %s" % context_data)
- context_file.close()
- continue
- if TheAddressTable != None:
- load_objs = parsed_file.getElementsByTagName("load_obj")
- for load_obj in load_objs:
- obj = getTextOf(load_obj, "obj")
- ip = getTextOf(load_obj, "ip")
- TheAddressTable.AddBinaryAt(obj, ip)
-
- commandline = None
-      preamble = parsed_file.getElementsByTagName("preamble")[0]
- for node in preamble.getElementsByTagName("line"):
- if node.localName == "line":
- for x in node.childNodes:
- if x.nodeType == node.TEXT_NODE and "Command" in x.data:
- commandline = x.data
- break
-
- raw_errors = parsed_file.getElementsByTagName("error")
- for raw_error in raw_errors:
- # Ignore "possible" leaks for now by default.
- if (self._show_all_leaks or
- getTextOf(raw_error, "kind") != "Leak_PossiblyLost"):
- error = ValgrindError(self._source_dir,
- raw_error, commandline, testcase)
- if error not in cur_report_errors:
-          # We haven't seen such errors during this report yet...
- if error in self._errors:
- # ... but we saw it in earlier reports, e.g. previous UI test
- cur_report_errors.add("This error was already printed in "
- "some other test, see 'hash=#%016X#'" % \
- error.ErrorHash())
- else:
- # ... and we haven't seen it in other tests as well
- self._errors.add(error)
- cur_report_errors.add(error)
-
- suppcountlist = parsed_file.getElementsByTagName("suppcounts")
- if len(suppcountlist) > 0:
- suppcountlist = suppcountlist[0]
- for node in suppcountlist.getElementsByTagName("pair"):
-          count = getTextOf(node, "count")
-          name = getTextOf(node, "name")
- suppcounts[name] += int(count)
-
- if len(badfiles) > 0:
- logging.warn("valgrind didn't finish writing %d files?!" % len(badfiles))
- for file in badfiles:
- logging.warn("Last 20 lines of %s :" % file)
- os.system("tail -n 20 '%s' 1>&2" % file)
-
- if parse_failed:
- logging.error("FAIL! Couldn't parse Valgrind output file")
- return -2
-
- common.PrintUsedSuppressionsList(suppcounts)
-
- retcode = 0
- if cur_report_errors:
- logging.error("FAIL! There were %s errors: " % len(cur_report_errors))
-
- if TheAddressTable != None:
- TheAddressTable.ResolveAll()
-
- for error in cur_report_errors:
- logging.error(error)
-
- retcode = -1
-
- # Report tool's insanity even if there were errors.
- if check_sanity:
-      # Copy, so repeated Report() calls don't mutate the class-level dict.
-      remaining_sanity_supp = dict(MemcheckAnalyzer.SANITY_TEST_SUPPRESSIONS)
- for (name, count) in suppcounts.iteritems():
- # Workaround for http://crbug.com/334074
- if (name in remaining_sanity_supp and
- remaining_sanity_supp[name] <= count):
- del remaining_sanity_supp[name]
- if remaining_sanity_supp:
- logging.error("FAIL! Sanity check failed!")
- logging.info("The following test errors were not handled: ")
- for (name, count) in remaining_sanity_supp.iteritems():
- logging.info(" * %dx %s" % (count, name))
- retcode = -3
-
- if retcode != 0:
- return retcode
-
- logging.info("PASS! No errors found!")
- return 0
-
-
-def _main():
- '''For testing only. The MemcheckAnalyzer class should be imported instead.'''
- parser = optparse.OptionParser("usage: %prog [options] <files to analyze>")
- parser.add_option("", "--source-dir",
- help="path to top of source tree for this build"
- "(used to normalize source paths in baseline)")
-
- (options, args) = parser.parse_args()
- if len(args) == 0:
- parser.error("no filename specified")
- filenames = args
-
- analyzer = MemcheckAnalyzer(options.source_dir, use_gdb=True)
- return analyzer.Report(filenames, None)
-
-
-if __name__ == "__main__":
- sys.exit(_main())
diff --git a/files/tools_libyuv/valgrind/valgrind.sh b/files/tools_libyuv/valgrind/valgrind.sh
deleted file mode 100755
index 7f3f7926..00000000
--- a/files/tools_libyuv/valgrind/valgrind.sh
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2017 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-# This is a small script for manually launching valgrind, along with passing
-# it the suppression file, and some helpful arguments (automatically attaching
-# the debugger on failures, etc). Run it from your repo root, something like:
-# $ sh ./tools/valgrind/valgrind.sh ./out/Debug/chrome
-#
-# This is mostly intended for running the chrome browser interactively.
-# To run unit tests, you probably want to run chrome_tests.sh instead.
-# That's the script used by the valgrind buildbot.
-
-export THISDIR=`dirname $0`
-
-setup_memcheck() {
- RUN_COMMAND="valgrind"
-
- # Prompt to attach gdb when there was an error detected.
- DEFAULT_TOOL_FLAGS=("--db-command=gdb -nw %f %p" "--db-attach=yes" \
- # Keep the registers in gdb in sync with the code.
- "--vex-iropt-register-updates=allregs-at-mem-access" \
- # Overwrite newly allocated or freed objects
- # with 0x41 to catch improper use.
- "--malloc-fill=41" "--free-fill=41" \
- # Record more stack frames in reports (valgrind's default is 12).
- "--num-callers=30")
-}
-
-setup_unknown() {
- echo "Unknown tool \"$TOOL_NAME\" specified, the result is not guaranteed"
- DEFAULT_TOOL_FLAGS=()
-}
-
-set -e
-
-if [ $# -eq 0 ]; then
- echo "usage: <command to run> <arguments ...>"
- exit 1
-fi
-
-TOOL_NAME="memcheck"
-declare -a DEFAULT_TOOL_FLAGS
-
-# Select a tool different from memcheck with --tool=TOOL as a first argument
-TMP_STR=`echo $1 | sed 's/^\-\-tool=//'`
-if [ "$TMP_STR" != "$1" ]; then
- TOOL_NAME="$TMP_STR"
- shift
-fi
-
-if echo "$@" | grep "\-\-tool" ; then
- echo "--tool=TOOL must be the first argument" >&2
- exit 1
-fi
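-
-# Example invocation (hypothetical binary path), selecting memcheck explicitly:
-#   $ sh ./tools_libyuv/valgrind/valgrind.sh --tool=memcheck ./out/Debug/libyuv_unittest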
-
-case $TOOL_NAME in
- memcheck*) setup_memcheck "$1";;
- *) setup_unknown;;
-esac
-
-
-SUPPRESSIONS="$THISDIR/$TOOL_NAME/suppressions.txt"
-
-CHROME_VALGRIND=`sh $THISDIR/locate_valgrind.sh`
-if [ "$CHROME_VALGRIND" = "" ]
-then
- # locate_valgrind.sh failed
- exit 1
-fi
-echo "Using valgrind binaries from ${CHROME_VALGRIND}"
-
-set -x
-PATH="${CHROME_VALGRIND}/bin:$PATH"
-# We need to set these variables to override the default lib paths hard-coded
-# into the Valgrind binary.
-export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind"
-export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind"
-
-# G_SLICE=always-malloc: make glib use system malloc
-# NSS_DISABLE_UNLOAD=1: make nss skip dlclosing dynamically loaded modules,
-# which would result in "obj:*" in backtraces.
-# NSS_DISABLE_ARENA_FREE_LIST=1: make nss use system malloc
-# G_DEBUG=fatal_warnings: make GTK abort on any critical or warning assertions.
-# If it crashes on you in the Options menu, you hit bug 19751,
-# comment out the G_DEBUG=fatal_warnings line.
-#
-# GTEST_DEATH_TEST_USE_FORK=1: make gtest death tests valgrind-friendly
-#
-# When everyone has the latest valgrind, we might want to add
-# --show-possibly-lost=no
-# to ignore possible but not definite leaks.
-
-G_SLICE=always-malloc \
-NSS_DISABLE_UNLOAD=1 \
-NSS_DISABLE_ARENA_FREE_LIST=1 \
-G_DEBUG=fatal_warnings \
-GTEST_DEATH_TEST_USE_FORK=1 \
-$RUN_COMMAND \
- --trace-children=yes \
- --leak-check=yes \
- --suppressions="$SUPPRESSIONS" \
- "${DEFAULT_TOOL_FLAGS[@]}" \
- "$@"
diff --git a/files/tools_libyuv/valgrind/valgrind_test.py b/files/tools_libyuv/valgrind/valgrind_test.py
deleted file mode 100755
index 0fd3d97f..00000000
--- a/files/tools_libyuv/valgrind/valgrind_test.py
+++ /dev/null
@@ -1,517 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-"""Runs an exe through Valgrind and puts the intermediate files in a
-directory.
-"""
-
-import datetime
-import glob
-import logging
-import optparse
-import os
-import re
-import shutil
-import stat
-import subprocess
-import sys
-import tempfile
-
-import common
-
-import memcheck_analyze
-
-class BaseTool(object):
- """Abstract class for running dynamic error detection tools.
-
- Always subclass this and implement ToolCommand with framework- and
- tool-specific stuff.
- """
-
- def __init__(self):
- temp_parent_dir = None
- self.log_parent_dir = ""
- if common.IsWindows():
- # gpu process on Windows Vista+ runs at Low Integrity and can only
- # write to certain directories (http://crbug.com/119131)
- #
- # TODO(bruening): if scripts die in middle and don't clean up temp
- # dir, we'll accumulate files in profile dir. should remove
- # really old files automatically.
- profile = os.getenv("USERPROFILE")
- if profile:
- self.log_parent_dir = profile + "\\AppData\\LocalLow\\"
- if os.path.exists(self.log_parent_dir):
- self.log_parent_dir = common.NormalizeWindowsPath(self.log_parent_dir)
- temp_parent_dir = self.log_parent_dir
- # Generated every time (even when overridden)
- self.temp_dir = tempfile.mkdtemp(prefix="vg_logs_", dir=temp_parent_dir)
- self.log_dir = self.temp_dir # overridable by --keep_logs
- self.option_parser_hooks = []
- # TODO(glider): we may not need some of the env vars on some of the
- # platforms.
- self._env = {
- "G_SLICE" : "always-malloc",
- "NSS_DISABLE_UNLOAD" : "1",
- "NSS_DISABLE_ARENA_FREE_LIST" : "1",
- "GTEST_DEATH_TEST_USE_FORK": "1",
- }
-
- def ToolName(self):
- raise NotImplementedError, "This method should be implemented " \
- "in the tool-specific subclass"
-
- def Analyze(self, check_sanity=False):
- raise NotImplementedError, "This method should be implemented " \
- "in the tool-specific subclass"
-
- def RegisterOptionParserHook(self, hook):
- # Frameworks and tools can add their own flags to the parser.
- self.option_parser_hooks.append(hook)
-
- def CreateOptionParser(self):
- # Defines Chromium-specific flags.
- self._parser = optparse.OptionParser("usage: %prog [options] <program to "
- "test>")
- self._parser.disable_interspersed_args()
- self._parser.add_option("-t", "--timeout",
- dest="timeout", metavar="TIMEOUT", default=10000,
- help="timeout in seconds for the run (default 10000)")
- self._parser.add_option("", "--build-dir",
- help="the location of the compiler output")
- self._parser.add_option("", "--source-dir",
- help="path to top of source tree for this build"
- "(used to normalize source paths in baseline)")
- self._parser.add_option("", "--gtest_filter", default="",
- help="which test case to run")
- self._parser.add_option("", "--gtest_repeat",
- help="how many times to run each test")
- self._parser.add_option("", "--gtest_print_time", action="store_true",
- default=False,
- help="show how long each test takes")
- self._parser.add_option("", "--ignore_exit_code", action="store_true",
- default=False,
- help="ignore exit code of the test "
- "(e.g. test failures)")
- self._parser.add_option("", "--keep_logs", action="store_true",
- default=False,
- help="store memory tool logs in the <tool>.logs "
- "directory instead of /tmp.\nThis can be "
- "useful for tool developers/maintainers.\n"
- "Please note that the <tool>.logs directory "
- "will be clobbered on tool startup.")
-
- # To add framework- or tool-specific flags, please add a hook using
- # RegisterOptionParserHook in the corresponding subclass.
- # See ValgrindTool for an example.
- for hook in self.option_parser_hooks:
- hook(self, self._parser)
-
- def ParseArgv(self, args):
- self.CreateOptionParser()
-
- # self._tool_flags will store those tool flags which we don't parse
- # manually in this script.
- self._tool_flags = []
- known_args = []
-
- """ We assume that the first argument not starting with "-" is a program
- name and all the following flags should be passed to the program.
- TODO(timurrrr): customize optparse instead
- """
- while len(args) > 0 and args[0][:1] == "-":
- arg = args[0]
- if (arg == "--"):
- break
- if self._parser.has_option(arg.split("=")[0]):
- known_args += [arg]
- else:
- self._tool_flags += [arg]
- args = args[1:]
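-
- # For example (flags here are illustrative): given
- #   --timeout=60 --error-exitcode=1 ./libyuv_unittest --gtest_color=no
- # "--timeout=60" is parsed by this script, "--error-exitcode=1" falls
- # through to self._tool_flags, and everything from "./libyuv_unittest"
- # onward is kept as the program and its own arguments.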
-
- if len(args) > 0:
- known_args += args
-
- self._options, self._args = self._parser.parse_args(known_args)
-
- self._timeout = int(self._options.timeout)
- self._source_dir = self._options.source_dir
- if self._options.keep_logs:
- # log_parent_dir has trailing slash if non-empty
- self.log_dir = self.log_parent_dir + "%s.logs" % self.ToolName()
- if os.path.exists(self.log_dir):
- shutil.rmtree(self.log_dir)
- os.mkdir(self.log_dir)
- logging.info("Logs are in " + self.log_dir)
-
- self._ignore_exit_code = self._options.ignore_exit_code
- if self._options.gtest_filter != "":
- self._args.append("--gtest_filter=%s" % self._options.gtest_filter)
- if self._options.gtest_repeat:
- self._args.append("--gtest_repeat=%s" % self._options.gtest_repeat)
- if self._options.gtest_print_time:
- self._args.append("--gtest_print_time")
-
- return True
-
- def Setup(self, args):
- return self.ParseArgv(args)
-
- def ToolCommand(self):
- raise NotImplementedError, "This method should be implemented " \
- "in the tool-specific subclass"
-
- def Cleanup(self):
- # You may override it in the tool-specific subclass
- pass
-
- def Execute(self):
- """ Execute the app to be tested after successful instrumentation.
- Full execution command-line provided by subclassers via proc."""
- logging.info("starting execution...")
- proc = self.ToolCommand()
- for var in self._env:
- common.PutEnvAndLog(var, self._env[var])
- return common.RunSubprocess(proc, self._timeout)
-
- def RunTestsAndAnalyze(self, check_sanity):
- exec_retcode = self.Execute()
- analyze_retcode = self.Analyze(check_sanity)
-
- if analyze_retcode:
- logging.error("Analyze failed.")
- logging.info("Search the log for '[ERROR]' to see the error reports.")
- return analyze_retcode
-
- if exec_retcode:
- if self._ignore_exit_code:
- logging.info("Test execution failed, but the exit code is ignored.")
- else:
- logging.error("Test execution failed.")
- return exec_retcode
- else:
- logging.info("Test execution completed successfully.")
-
- if not analyze_retcode:
- logging.info("Analysis completed successfully.")
-
- return 0
-
- def Main(self, args, check_sanity, min_runtime_in_seconds):
- """Call this to run through the whole process: Setup, Execute, Analyze"""
- start_time = datetime.datetime.now()
- retcode = -1
- if self.Setup(args):
- retcode = self.RunTestsAndAnalyze(check_sanity)
- shutil.rmtree(self.temp_dir, ignore_errors=True)
- self.Cleanup()
- else:
- logging.error("Setup failed")
- end_time = datetime.datetime.now()
- runtime_in_seconds = (end_time - start_time).seconds
- hours = runtime_in_seconds / 3600
- seconds = runtime_in_seconds % 3600
- minutes = seconds / 60
- seconds = seconds % 60
- logging.info("elapsed time: %02d:%02d:%02d" % (hours, minutes, seconds))
- if (min_runtime_in_seconds > 0 and
- runtime_in_seconds < min_runtime_in_seconds):
- logging.error("Layout tests finished too quickly. "
- "It should have taken at least %d seconds. "
- "Something went wrong?" % min_runtime_in_seconds)
- retcode = -1
- return retcode
-
- def Run(self, args, module, min_runtime_in_seconds=0):
- MODULES_TO_SANITY_CHECK = ["base"]
-
- check_sanity = module in MODULES_TO_SANITY_CHECK
- return self.Main(args, check_sanity, min_runtime_in_seconds)
-
-
-class ValgrindTool(BaseTool):
- """Abstract class for running Valgrind tools.
-
- Always subclass this and implement ToolSpecificFlags() and
- ExtendOptionParser() for tool-specific stuff.
- """
- def __init__(self):
- super(ValgrindTool, self).__init__()
- self.RegisterOptionParserHook(ValgrindTool.ExtendOptionParser)
-
- def UseXML(self):
- # Override if the tool prefers non-XML output
- return True
-
- def ExtendOptionParser(self, parser):
- parser.add_option("", "--suppressions", default=[],
- action="append",
- help="path to a valgrind suppression file")
- parser.add_option("", "--indirect", action="store_true",
- default=False,
- help="set BROWSER_WRAPPER rather than "
- "running valgrind directly")
- parser.add_option("", "--indirect_webkit_layout", action="store_true",
- default=False,
- help="set --wrapper rather than running Dr. Memory "
- "directly.")
- parser.add_option("", "--trace_children", action="store_true",
- default=False,
- help="also trace child processes")
- parser.add_option("", "--num-callers",
- dest="num_callers", default=30,
- help="number of callers to show in stack traces")
- parser.add_option("", "--generate_dsym", action="store_true",
- default=False,
- help="Generate .dSYM file on Mac if needed. Slow!")
-
- def Setup(self, args):
- if not BaseTool.Setup(self, args):
- return False
- return True
-
- def ToolCommand(self):
- """Get the valgrind command to run."""
- # Note that self._args begins with the exe to be run.
- tool_name = self.ToolName()
-
- # Construct the valgrind command.
- if 'CHROME_VALGRIND' in os.environ:
- path = os.path.join(os.environ['CHROME_VALGRIND'], "bin", "valgrind")
- else:
- path = "valgrind"
- proc = [path, "--tool=%s" % tool_name]
-
- proc += ["--num-callers=%i" % int(self._options.num_callers)]
-
- if self._options.trace_children:
- proc += ["--trace-children=yes"]
- proc += ["--trace-children-skip='*dbus-daemon*'"]
- proc += ["--trace-children-skip='*dbus-launch*'"]
- proc += ["--trace-children-skip='*perl*'"]
- proc += ["--trace-children-skip='*python*'"]
- # This is really Python, but for some reason Valgrind follows it.
- proc += ["--trace-children-skip='*lsb_release*'"]
-
- proc += self.ToolSpecificFlags()
- proc += self._tool_flags
-
- suppression_count = 0
- for suppression_file in self._options.suppressions:
- if os.path.exists(suppression_file):
- suppression_count += 1
- proc += ["--suppressions=%s" % suppression_file]
-
- if not suppression_count:
- logging.warning("WARNING: NOT USING SUPPRESSIONS!")
-
- logfilename = self.log_dir + ("/%s." % tool_name) + "%p"
- if self.UseXML():
- proc += ["--xml=yes", "--xml-file=" + logfilename]
- else:
- proc += ["--log-file=" + logfilename]
-
- # The Valgrind command is constructed.
-
- # Handle --indirect_webkit_layout separately.
- if self._options.indirect_webkit_layout:
- # Need to create the wrapper before modifying |proc|.
- wrapper = self.CreateBrowserWrapper(proc, webkit=True)
- proc = self._args
- proc.append("--wrapper")
- proc.append(wrapper)
- return proc
-
- if self._options.indirect:
- wrapper = self.CreateBrowserWrapper(proc)
- os.environ["BROWSER_WRAPPER"] = wrapper
- logging.info('export BROWSER_WRAPPER=' + wrapper)
- proc = []
- proc += self._args
- return proc
-
- def ToolSpecificFlags(self):
- raise NotImplementedError, "This method should be implemented " \
- "in the tool-specific subclass"
-
- def CreateBrowserWrapper(self, proc, webkit=False):
- """The program being run invokes Python or something else that can't stand
- to be valgrinded, and also invokes the Chrome browser. In this case, use a
- magic wrapper to only valgrind the Chrome browser. Build the wrapper here.
- Returns the path to the wrapper. It's up to the caller to use the wrapper
- appropriately.
- """
- command = " ".join(proc)
- # Add the PID of the browser wrapper to the logfile names so we can
- # separate log files for different UI tests at the analyze stage.
- command = command.replace("%p", "$$.%p")
-
- (fd, indirect_fname) = tempfile.mkstemp(dir=self.log_dir,
- prefix="browser_wrapper.",
- text=True)
- f = os.fdopen(fd, "w")
- f.write('#!/bin/bash\n'
- 'echo "Started Valgrind wrapper for this test, PID=$$" >&2\n')
-
- f.write('DIR=`dirname $0`\n'
- 'TESTNAME_FILE=$DIR/testcase.$$.name\n\n')
-
- if webkit:
- # Webkit layout_tests pass the URL as the first line of stdin.
- f.write('tee $TESTNAME_FILE | %s "$@"\n' % command)
- else:
- # Try to get the test case name by looking at the program arguments.
- # i.e. Chromium ui_tests used --test-name arg.
- # TODO(timurrrr): This doesn't handle "--test-name Test.Name"
- # TODO(timurrrr): ui_tests are dead. Where do we use the non-webkit
- # wrapper now? browser_tests? What do they do?
- f.write('for arg in $@\ndo\n'
- ' if [[ "$arg" =~ --test-name=(.*) ]]\n then\n'
- ' echo ${BASH_REMATCH[1]} >$TESTNAME_FILE\n'
- ' fi\n'
- 'done\n\n'
- '%s "$@"\n' % command)
-
- f.close()
- os.chmod(indirect_fname, stat.S_IRUSR|stat.S_IXUSR)
- return indirect_fname
-
- def CreateAnalyzer(self):
- raise NotImplementedError, "This method should be implemented " \
- "in the tool-specific subclass"
-
- def GetAnalyzeResults(self, check_sanity=False):
- # Glob all the files in the log directory
- filenames = glob.glob(self.log_dir + "/" + self.ToolName() + ".*")
-
- # If we have browser wrapper, the logfiles are named as
- # "toolname.wrapper_PID.valgrind_PID".
- # Let's extract the list of wrapper_PIDs and name it ppids
- ppids = set([int(f.split(".")[-2]) \
- for f in filenames if re.search("\.[0-9]+\.[0-9]+$", f)])
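-
- # For example (hypothetical file names): "memcheck.1234.5678" was written
- # by valgrind PID 5678 under wrapper PID 1234, so 1234 lands in ppids;
- # a wrapper-less log named "memcheck.5678" does not match the pattern.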
-
- analyzer = self.CreateAnalyzer()
- if len(ppids) == 0:
- # Fast path - no browser wrapper was set.
- return analyzer.Report(filenames, None, check_sanity)
-
- ret = 0
- for ppid in ppids:
- testcase_name = None
- try:
- f = open(self.log_dir + ("/testcase.%d.name" % ppid))
- testcase_name = f.read().strip()
- f.close()
- wk_layout_prefix="third_party/WebKit/LayoutTests/"
- wk_prefix_at = testcase_name.rfind(wk_layout_prefix)
- if wk_prefix_at != -1:
- testcase_name = testcase_name[wk_prefix_at + len(wk_layout_prefix):]
- except IOError:
- pass
- print "====================================================="
- print " Below is the report for valgrind wrapper PID=%d." % ppid
- if testcase_name:
- print " It was used while running the `%s` test." % testcase_name
- else:
- print " You can find the corresponding test"
- print " by searching the above log for 'PID=%d'" % ppid
- sys.stdout.flush()
-
- ppid_filenames = [f for f in filenames \
- if re.search("\.%d\.[0-9]+$" % ppid, f)]
- # check_sanity won't work with browser wrappers
- assert not check_sanity
- ret |= analyzer.Report(ppid_filenames, testcase_name)
- print "====================================================="
- sys.stdout.flush()
-
- if ret != 0:
- print ""
- print "The Valgrind reports are grouped by test names."
- print "Each test has its PID printed in the log when the test was run"
- print "and at the beginning of its Valgrind report."
- print "Hint: you can search for the reports by Ctrl+F -> `=#`"
- sys.stdout.flush()
-
- return ret
-
-
-# TODO(timurrrr): Split into a separate file.
-class Memcheck(ValgrindTool):
- """Memcheck
- Dynamic memory error detector for Linux & Mac
-
- http://valgrind.org/info/tools.html#memcheck
- """
-
- def __init__(self):
- super(Memcheck, self).__init__()
- self.RegisterOptionParserHook(Memcheck.ExtendOptionParser)
-
- def ToolName(self):
- return "memcheck"
-
- def ExtendOptionParser(self, parser):
- parser.add_option("--leak-check", "--leak_check", type="string",
- default="yes", # --leak-check=yes is equivalent of =full
- help="perform leak checking at the end of the run")
- parser.add_option("", "--show_all_leaks", action="store_true",
- default=False,
- help="also show less blatant leaks")
- parser.add_option("", "--track_origins", action="store_true",
- default=False,
- help="Show whence uninitialized bytes came. 30% slower.")
-
- def ToolSpecificFlags(self):
- ret = ["--gen-suppressions=all", "--demangle=no"]
- ret += ["--leak-check=%s" % self._options.leak_check]
-
- if self._options.show_all_leaks:
- ret += ["--show-reachable=yes"]
- else:
- ret += ["--show-possibly-lost=no"]
-
- if self._options.track_origins:
- ret += ["--track-origins=yes"]
-
- # TODO(glider): this is a temporary workaround for http://crbug.com/51716
- # Let's see whether it helps.
- if common.IsMac():
- ret += ["--smc-check=all"]
-
- return ret
-
- def CreateAnalyzer(self):
- use_gdb = common.IsMac()
- return memcheck_analyze.MemcheckAnalyzer(self._source_dir,
- self._options.show_all_leaks,
- use_gdb=use_gdb)
-
- def Analyze(self, check_sanity=False):
- ret = self.GetAnalyzeResults(check_sanity)
-
- if ret != 0:
- logging.info("Please see http://dev.chromium.org/developers/how-tos/"
- "using-valgrind for the info on Memcheck/Valgrind")
- return ret
-
-
-class ToolFactory:
- def Create(self, tool_name):
- if tool_name == "memcheck":
- return Memcheck()
- try:
- platform_name = common.PlatformNames()[0]
- except common.NotImplementedError:
- platform_name = sys.platform + "(Unknown)"
- raise RuntimeError, "Unknown tool (tool=%s, platform=%s)" % (tool_name,
- platform_name)
-
-def CreateTool(tool):
- return ToolFactory().Create(tool)
diff --git a/files/unit_test/convert_test.cc b/files/unit_test/convert_test.cc
deleted file mode 100644
index 32a4cd1c..00000000
--- a/files/unit_test/convert_test.cc
+++ /dev/null
@@ -1,3223 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#include <stdlib.h>
-#include <time.h>
-
-#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/compare.h"
-#include "libyuv/convert.h"
-#include "libyuv/convert_argb.h"
-#include "libyuv/convert_from.h"
-#include "libyuv/convert_from_argb.h"
-#include "libyuv/cpu_id.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "../unit_test/unit_test.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "libyuv/video_common.h"
-
-#if defined(__arm__) || defined(__aarch64__)
-// The ARM version subsamples by summing 4 pixels, then multiplying by a
-// matrix with 4x smaller coefficients rounded to the nearest integer.
-#define ARM_YUV_ERROR 4
-#else
-#define ARM_YUV_ERROR 0
-#endif
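-// Rough intuition for the bound (assumed, not derived here): rounding the
-// 4x-smaller coefficients to integers perturbs each product slightly, and
-// across the YUV matrix this can move an 8-bit result by a few counts,
-// hence the tolerance of 4 on ARM.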
-
-namespace libyuv {
-
-// Aliases to copy pixels as-is
-#define AR30ToAR30 ARGBCopy
-#define ABGRToABGR ARGBCopy
-
-#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
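-// SUBSAMPLE is a ceiling divide, e.g. SUBSAMPLE(5, 2) == 3: a 5-pixel-wide
-// plane subsampled by 2 still needs 3 chroma samples.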
-
-// Planar test
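-// Each test below follows the same pattern: run the conversion once with
-// SIMD masked off (MaskCpuFlags(disable_cpu_flags_)) to get the C reference,
-// run it benchmark_iterations_ times with SIMD enabled, then compare the two
-// outputs byte for byte (or within a small tolerance for lossy paths).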
-
-#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
- SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
- DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
- static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
- static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
- "DST SRC_SUBSAMP_X unsupported"); \
- static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
- "DST SRC_SUBSAMP_Y unsupported"); \
- static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
- "DST DST_SUBSAMP_X unsupported"); \
- static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
- "DST DST_SUBSAMP_Y unsupported"); \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
- const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
- const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
- const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
- align_buffer_page_end(src_u, \
- kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
- align_buffer_page_end(src_v, \
- kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
- align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
- align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
- MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
- MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
- memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
- memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
- memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
- reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
- reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
- reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
- reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
- reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
- NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
- reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
- reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
- reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
- reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
- reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
- NEG kHeight); \
- } \
- for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
- EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
- } \
- for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
- EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
- EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
- } \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- }
-
-#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
- SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
- DST_SUBSAMP_X, DST_SUBSAMP_Y) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
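-
-// The four instantiations cover: _Any (width benchmark_width_ - 4, which is
-// typically not a multiple of the SIMD step, exercising the any-width
-// fallback), _Unaligned (buffers offset by 1 byte), _Invert (negative height,
-// i.e. bottom-up images), and _Opt (aligned buffers at full width).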
-
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1)
-TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1)
-TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2)
-TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2)
-TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2)
-TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2)
-TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2)
-
-// Test Android 420 to I420
-#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \
- SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, N, NEG, OFF, PN, OFF_U, OFF_V) \
- TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##_##PN##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kSizeUV = \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_uv, \
- kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- uint8_t* src_u = src_uv + OFF_U; \
- uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
- int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
- (fastrand() & 0xff); \
- src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
- (fastrand() & 0xff); \
- } \
- } \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_u_c, 2, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 3, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_u_opt, 102, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 103, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \
- kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \
- dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_EQ(0, max_diff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 3); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 3); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_uv); \
- }
-
-#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \
- SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \
- SUBSAMP_Y) \
- TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, \
- _Any, +, 0, PN, OFF_U, OFF_V) \
- TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \
- _Unaligned, +, 1, PN, OFF_U, OFF_V) \
- TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \
- -, 0, PN, OFF_U, OFF_V) \
- TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \
- 0, PN, OFF_U, OFF_V)
-
-TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2)
-TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
-TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
-
-// Wrapper to keep the API the same as the other planar conversions; I400 has
-// no U/V planes, so the U/V arguments are accepted and ignored.
-int I400ToNV21(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* /* src_u */,
- int /* src_stride_u */,
- const uint8_t* /* src_v */,
- int /* src_stride_v */,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
- int width,
- int height) {
- return I400ToNV21(src_y, src_stride_y, dst_y, dst_stride_y, dst_vu,
- dst_stride_vu, width, height);
-}
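-
-// This lets the TESTPLANARTOBP(I400, ...) instantiation below call
-// I400ToNV21 through the same three-plane entry point as I420ToNV21 etc.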
-
-#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
- OFF); \
- align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
- OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
- src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
- } \
- } \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_uv_c, 2, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_uv_opt, 102, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
- dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
- dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
- int abs_diff = \
- abs(static_cast<int>( \
- dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
- static_cast<int>( \
- dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_uv_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_uv_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- }
-
-#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
-
-TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
-TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
-TESTPLANARTOBP(I422, 2, 1, NV21, 2, 2)
-TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2)
-TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
-
-#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \
- OFF) \
- TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_uv, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
- OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_uv[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) * 2 + j + 0 + OFF] = \
- (fastrand() & 0xff); \
- src_uv[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) * 2 + j + 1 + OFF] = \
- (fastrand() & 0xff); \
- } \
- } \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_uv_c, 2, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_uv_opt, 102, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_uv + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2, dst_y_c, kWidth, dst_uv_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_uv + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2, dst_y_opt, kWidth, dst_uv_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
- int abs_diff = \
- abs(static_cast<int>( \
- dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
- static_cast<int>( \
- dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_uv_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_uv_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_uv); \
- }
-
-#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \
- TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \
- TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
- TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
-
-// TODO(fbarchard): Fix msan on this unittest
-// TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
-
-#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \
- DOY) \
- TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
- OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
- } \
- } \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_u_c, 2, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 3, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_u_opt, 102, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 103, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_uv + OFF, \
- 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_c : NULL, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_uv + OFF, \
- 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_opt : NULL, \
- kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- if (DOY) { \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_uv); \
- }
-
-#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \
- 1) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
-
-TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
-TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
-
-#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
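-// ALIGNINT rounds V up to the nearest multiple of ALIGN, e.g.
-// ALIGNINT(10, 4) == 12.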
-
-#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- double time0 = get_time(); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \
- kWidth, NEG kHeight); \
- double time1 = get_time(); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, dst_argb_opt + OFF, \
- kStrideB, kWidth, NEG kHeight); \
- } \
- double time2 = get_time(); \
- printf(" %8d us C - %8d us OPT\n", \
- static_cast<int>((time1 - time0) * 1e6), \
- static_cast<int>((time2 - time1) * 1e6 / benchmark_iterations_)); \
- for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
- EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \
- } \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, _Opt, +, 0)
-
-TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1)
-TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1)
-TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1)
-TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1)
-TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1)
-TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
-TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
-TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
-TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
-TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1)
-TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1)
-TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1)
-TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1)
-TESTPLANARTOB(J420, 2, 2, RGB565, 2, 2, 1)
-TESTPLANARTOB(H420, 2, 2, RGB565, 2, 2, 1)
-TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1)
-TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1)
-TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
-TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1)
-TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1)
-TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1)
-TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1)
-TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1)
-TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
-TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
-TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
-TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
-TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1)
-TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
-TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
-TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
-TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
-TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1)
-TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1)
-TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1)
-TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1)
-TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
-
-#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- src_a[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
- dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \
- ATTEN); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
- dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \
- ATTEN); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i + OFF]) - \
- static_cast<int>(dst_argb_opt[i + OFF])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(src_a); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) \
- TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1)
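-
-// The trailing ATTEN argument is the converter's attenuate flag; the
-// _Premult variant passes 1 so RGB is premultiplied by alpha during
-// conversion.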
-
-TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)
-TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
-
-#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \
- BPP_B, W1280, DIFF, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = kWidth * BPP_B; \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_uv, \
- kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV * 2; ++j) { \
- src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff); \
- } \
- } \
- memset(dst_argb_c, 1, kStrideB* kHeight); \
- memset(dst_argb_opt, 101, kStrideB* kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
- dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
- dst_argb_opt, kWidth * BPP_B, kWidth, \
- NEG kHeight); \
- } \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight); \
- align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \
- memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \
- memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \
- FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \
- kHeight); \
- FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
- kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth * 4; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \
- static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_uv); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- free_aligned_buffer_page_end(dst_argb32_c); \
- free_aligned_buffer_page_end(dst_argb32_opt); \
- }
-
-#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
- DIFF) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3, 2)
-TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3, 2)
-TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2, 9)
-TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2)
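-
-// Hedged sketch of what one instantiation above generates: the _Opt variant
-// of TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4, 2) times calls shaped like
-// the hand-written one below (a 2x2 NV12 image with one interleaved UV pair).
-TEST_F(LibYUVConvertTest, NV12ToARGBTinySketch) {
-  SIMD_ALIGNED(uint8_t src_y[4]) = {16, 16, 16, 16};  // 2x2 luma plane.
-  SIMD_ALIGNED(uint8_t src_uv[2]) = {128, 128};       // Interleaved U, V.
-  SIMD_ALIGNED(uint8_t dst_argb[16]);
-  NV12ToARGB(src_y, 2, src_uv, 2, dst_argb, 8, 2, 2);
-  // Y=16, U=V=128 is black in BT.601; all color channels should be near 0.
-  EXPECT_NEAR(0, dst_argb[0], 2);  // B.
-  EXPECT_NEAR(0, dst_argb[1], 2);  // G.
-  EXPECT_NEAR(0, dst_argb[2], 2);  // R.
-}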
-
-#ifdef DO_THREE_PLANES
-// Do 3 allocations for YUV; conventional but slower.
-#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, DIFF, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
- align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_u_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_u_opt, \
- kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, \
- kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_u_c, 2, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 3, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_u_opt, 102, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 103, kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_u_c, \
- kStrideUV, dst_v_c, kStrideUV, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
- dst_u_opt, kStrideUV, dst_v_opt, kStrideUV, \
- kWidth, NEG kHeight); \
- } \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \
- static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \
- } \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_u_c[i * kStrideUV + j]), \
- static_cast<int>(dst_u_opt[i * kStrideUV + j]), DIFF); \
- } \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_v_c[i * kStrideUV + j]), \
- static_cast<int>(dst_v_opt[i * kStrideUV + j]), DIFF); \
- } \
- } \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_u_c); \
- free_aligned_buffer_page_end(dst_v_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_u_opt); \
- free_aligned_buffer_page_end(dst_v_opt); \
- free_aligned_buffer_page_end(src_argb); \
- }
-#else
-#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, DIFF, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
- align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_c, \
- kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_opt, \
- kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
- kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
- dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \
- kStrideUV * 2, kWidth, NEG kHeight); \
- } \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \
- static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \
- } \
- } \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \
- for (int j = 0; j < kStrideUV; ++j) { \
- EXPECT_NEAR(static_cast<int>(dst_uv_c[i * kStrideUV + j]), \
- static_cast<int>(dst_uv_opt[i * kStrideUV + j]), DIFF); \
- } \
- } \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_uv_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_uv_opt); \
- free_aligned_buffer_page_end(src_argb); \
- }
-#endif
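-
-// Hedged note on the default (two-allocation) path above: U and V share one
-// buffer by doubling the chroma stride and offsetting the V pointer by one
-// chroma row, so U and V rows alternate within a single allocation while the
-// conventional three-plane API is still exercised. A minimal 2x2 sketch:
-TEST_F(LibYUVConvertTest, SharedUVBufferSketch) {
-  SIMD_ALIGNED(uint8_t src_argb[16]);
-  SIMD_ALIGNED(uint8_t dst_y[4]);
-  SIMD_ALIGNED(uint8_t dst_uv[2]);  // kStrideUV == 1; U at [0], V at [1].
-  memset(src_argb, 0xff, sizeof(src_argb));  // 2x2 white.
-  ARGBToI420(src_argb, 8, dst_y, 2, dst_uv, 2, dst_uv + 1, 2, 2, 2);
-  EXPECT_NEAR(128, dst_uv[0], 2);  // U row landed at offset 0.
-  EXPECT_NEAR(128, dst_uv[1], 2);  // V row landed at offset kStrideUV.
-}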
-
-#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- DIFF) \
- TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
-TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
-TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, ARM_YUV_ERROR)
-TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, ARM_YUV_ERROR)
-TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
-TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
-TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
-TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
-TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
-// TODO(fbarchard): Investigate J420 error of 11 on Windows.
-TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, 11)
-TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
-TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
-TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
-TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
-TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
-
-#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \
- SUBSAMP_Y, W1280, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_c, \
- kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_opt, \
- kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
- kStrideUV * 2, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
- dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 4); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < kStrideUV * 2; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_uv_c[i * kStrideUV * 2 + j]) - \
- static_cast<int>(dst_uv_opt[i * kStrideUV * 2 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 4); \
- free_aligned_buffer_page_end(dst_y_c); \
- free_aligned_buffer_page_end(dst_uv_c); \
- free_aligned_buffer_page_end(dst_y_opt); \
- free_aligned_buffer_page_end(dst_uv_opt); \
- free_aligned_buffer_page_end(src_argb); \
- }
-
-#define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2)
-TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2)
-TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2)
-TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
-TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2)
-TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2)
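-
-// Hedged sketch of a single biplanar conversion outside the macros: a 2x2
-// solid-white ARGB image should produce limited-range white luma (Y=235 in
-// BT.601) and a neutral interleaved UV pair.
-TEST_F(LibYUVConvertTest, ARGBToNV12TinySketch) {
-  SIMD_ALIGNED(uint8_t src_argb[16]);
-  SIMD_ALIGNED(uint8_t dst_y[4]);
-  SIMD_ALIGNED(uint8_t dst_uv[2]);
-  memset(src_argb, 0xff, sizeof(src_argb));  // 2x2 white, alpha 255.
-  ARGBToNV12(src_argb, 8, dst_y, 2, dst_uv, 2, 2, 2);
-  EXPECT_NEAR(235, dst_y[0], 2);   // Luma of white in limited range.
-  EXPECT_NEAR(128, dst_uv[0], 2);  // U is neutral for white.
-  EXPECT_NEAR(128, dst_uv[1], 2);  // V is neutral for white.
-}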
-
-#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = \
- (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = \
- (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 1, kStrideB* kHeightB); \
- memset(dst_argb_opt, 101, kStrideB* kHeightB); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, kWidth, \
- NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_opt, kStrideB, \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
- STRIDE_B, HEIGHT_B, DIFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \
- for (int times = 0; times < benchmark_iterations_; ++times) { \
- const int kWidth = (fastrand() & 63) + 1; \
- const int kHeight = (fastrand() & 31) + 1; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = \
- (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = \
- (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_page_end(src_argb, kStrideA* kHeightA); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 123, kStrideB* kHeightB); \
- memset(dst_argb_opt, 123, kStrideB* kHeightB); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_c, kStrideB, kWidth, \
- kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \
- kHeight); \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- EXPECT_NEAR(dst_argb_c[i], dst_argb_opt[i], DIFF); \
- } \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- } \
- }
-
-#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \
- TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF)
-
-// TODO(fbarchard): make ARM version of C code that matches NEON.
-TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1, 0)
-TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1, 0)
-TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
-TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
-TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
-TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
-TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0)
-TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0)
-TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1, 0)
-TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR)
-TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR)
-TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0)
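-
-// Hedged worked example of the stride rounding used by TESTATOBI above:
-// (kWidth * BPP + STRIDE - 1) / STRIDE * STRIDE rounds the row byte count up
-// to the next multiple of STRIDE. A 17-pixel, 2-byte-per-pixel row aligned
-// to 4 bytes holds 34 data bytes padded out to 36:
-TEST_F(LibYUVConvertTest, StrideRoundingSketch) {
-  const int kWidth = 17;
-  const int kBpp = 2;
-  const int kAlign = 4;
-  const int kStride = (kWidth * kBpp + kAlign - 1) / kAlign * kAlign;
-  EXPECT_EQ(36, kStride);          // 34 rounded up to a multiple of 4.
-  EXPECT_EQ(0, kStride % kAlign);  // Always a whole number of units.
-}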
-
-#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = \
- (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = \
- (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 1, kStrideB* kHeightB); \
- memset(dst_argb_opt, 101, kStrideB* kHeightB); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \
- NULL, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_opt, \
- kStrideB, NULL, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
- STRIDE_B, HEIGHT_B, DIFF) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \
- for (int times = 0; times < benchmark_iterations_; ++times) { \
- const int kWidth = (fastrand() & 63) + 1; \
- const int kHeight = (fastrand() & 31) + 1; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = \
- (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = \
- (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_page_end(src_argb, kStrideA* kHeightA); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 123, kStrideB* kHeightB); \
- memset(dst_argb_opt, 123, kStrideB* kHeightB); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \
- kWidth, kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB, \
- NULL, kWidth, kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- } \
- }
-
-#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF) \
- TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0) \
- TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
- HEIGHT_B, DIFF)
-
-TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
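-
-// Hedged usage sketch: the dither macros above pass NULL, which disables
-// dithering. A caller can instead supply a 16-byte 4x4 ordered-dither table
-// (the Bayer matrix below is illustrative, not a libyuv constant):
-TEST_F(LibYUVConvertTest, ARGBToRGB565DitherMatrixSketch) {
-  static const uint8_t kBayer4x4[16] = {0, 8,  2, 10, 12, 4, 14, 6,
-                                        3, 11, 1, 9,  15, 7, 13, 5};
-  SIMD_ALIGNED(uint8_t src_argb[4 * 4 * 4]);    // 4x4 ARGB.
-  SIMD_ALIGNED(uint8_t dst_rgb565[4 * 4 * 2]);  // 4x4 RGB565.
-  memset(src_argb, 0x80, sizeof(src_argb));     // Mid-gray everywhere.
-  EXPECT_EQ(0, ARGBToRGB565Dither(src_argb, 4 * 4, dst_rgb565, 4 * 2,
-                                  kBayer4x4, 4, 4));
-}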
-
-#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, OFF) \
-  TEST_F(LibYUVConvertTest, FMT_ATOB##_Symmetric##N) {                        \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kStrideA = \
- (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideA* kHeightA); \
- align_buffer_page_end(dst_argb_opt, kStrideA* kHeightA); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c, 1, kStrideA* kHeightA); \
- memset(dst_argb_opt, 101, kStrideA* kHeightA); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_c, kStrideA, kWidth, \
- NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_opt, kStrideA, kWidth, \
- NEG kHeight); \
- } \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_ATOB(dst_argb_c, kStrideA, dst_argb_c, kStrideA, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- FMT_ATOB(dst_argb_opt, kStrideA, dst_argb_opt, kStrideA, kWidth, \
- NEG kHeight); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \
- EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
- } \
- free_aligned_buffer_page_end(src_argb); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A) \
- TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ - 4, _Any, +, \
- 0) \
- TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Unaligned, \
- +, 1) \
- TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Opt, +, 0)
-
-TESTSYM(ARGBToARGB, 4, 4, 1)
-TESTSYM(ARGBToBGRA, 4, 4, 1)
-TESTSYM(ARGBToABGR, 4, 4, 1)
-TESTSYM(BGRAToARGB, 4, 4, 1)
-TESTSYM(ABGRToARGB, 4, 4, 1)
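-
-// Hedged note on TESTSYM: each conversion above is a pure byte shuffle that
-// is its own inverse, so applying it twice must reproduce the input exactly;
-// that is what the in-place second pass in TESTSYMI verifies. One pixel:
-TEST_F(LibYUVConvertTest, ARGBToABGRInvolutionSketch) {
-  SIMD_ALIGNED(uint8_t pixel[4]) = {1, 2, 3, 4};  // B, G, R, A.
-  SIMD_ALIGNED(uint8_t once[4]);
-  SIMD_ALIGNED(uint8_t twice[4]);
-  ARGBToABGR(pixel, 4, once, 4, 1, 1);  // Swaps the R and B channels.
-  ARGBToABGR(once, 4, twice, 4, 1, 1);  // Swapping again restores them.
-  for (int i = 0; i < 4; ++i) {
-    EXPECT_EQ(pixel[i], twice[i]);
-  }
-}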
-
-TEST_F(LibYUVConvertTest, Test565) {
- SIMD_ALIGNED(uint8_t orig_pixels[256][4]);
- SIMD_ALIGNED(uint8_t pixels565[256][2]);
-
- for (int i = 0; i < 256; ++i) {
- for (int j = 0; j < 4; ++j) {
- orig_pixels[i][j] = i;
- }
- }
- ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
- uint32_t checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
- EXPECT_EQ(610919429u, checksum);
-}
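-
-// Hedged companion to Test565: for a gray value g the expected 565 packing
-// is ((g >> 3) << 11) | ((g >> 2) << 5) | (g >> 3), i.e. the top 5, 6 and 5
-// bits of R, G and B. Spot-checking one value (assuming a little-endian
-// target, as the fixed checksum above already does) makes the hash less
-// opaque:
-TEST_F(LibYUVConvertTest, Test565PackingSketch) {
-  SIMD_ALIGNED(uint8_t argb[4]) = {200, 200, 200, 255};  // B, G, R, A.
-  SIMD_ALIGNED(uint8_t rgb565[2]);
-  ARGBToRGB565(argb, 4, rgb565, 2, 1, 1);
-  const int packed = rgb565[0] | (rgb565[1] << 8);
-  EXPECT_EQ(((200 >> 3) << 11) | ((200 >> 2) << 5) | (200 >> 3), packed);
-}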
-
-#ifdef HAVE_JPEG
-TEST_F(LibYUVConvertTest, ValidateJpeg) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
- ? benchmark_width_ * benchmark_height_
- : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_page_end(orig_pixels, kSize);
-
- // No SOI or EOI. Expect fail.
- memset(orig_pixels, 0, kSize);
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- // Test special value that matches marker start.
- memset(orig_pixels, 0xff, kSize);
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- // EOI, SOI. Expect pass.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[2] = 0xff;
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
- for (int times = 0; times < benchmark_iterations_; ++times) {
- EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
- }
- free_aligned_buffer_page_end(orig_pixels);
-}
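-
-// Hedged note: ValidateJpeg only needs the two-byte big-endian framing
-// markers, SOI (0xffd8) at the start and EOI (0xffd9) near the end, plus the
-// 64-byte minimum that kMinJpeg above suggests. Under those assumptions, the
-// smallest hand-built buffer that should pass is:
-TEST_F(LibYUVConvertTest, ValidateJpegMarkerSketch) {
-  uint8_t buf[64];
-  memset(buf, 0, sizeof(buf));
-  buf[0] = 0xff;
-  buf[1] = 0xd8;  // SOI.
-  buf[2] = 0xff;  // Next marker prefix, as the tests above also set.
-  buf[62] = 0xff;
-  buf[63] = 0xd9;  // EOI, found by the backwards tail scan.
-  EXPECT_TRUE(ValidateJpeg(buf, sizeof(buf)));
-}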
-
-TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
- ? benchmark_width_ * benchmark_height_
- : kMinJpeg;
- const int kSize = kImageSize + kOff;
- const int kMultiple = 10;
- const int kBufSize = kImageSize * kMultiple + kOff;
- align_buffer_page_end(orig_pixels, kBufSize);
-
- // No SOI or EOI. Expect fail.
- memset(orig_pixels, 0, kBufSize);
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kBufSize));
-
- // EOI, SOI. Expect pass.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[2] = 0xff;
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
- for (int times = 0; times < benchmark_iterations_; ++times) {
- EXPECT_TRUE(ValidateJpeg(orig_pixels, kBufSize));
- }
- free_aligned_buffer_page_end(orig_pixels);
-}
-
-TEST_F(LibYUVConvertTest, InvalidateJpeg) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
- ? benchmark_width_ * benchmark_height_
- : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_page_end(orig_pixels, kSize);
-
- // NULL pointer. Expect fail.
- EXPECT_FALSE(ValidateJpeg(NULL, kSize));
-
- // Negative size. Expect fail.
- EXPECT_FALSE(ValidateJpeg(orig_pixels, -1));
-
- // Too large size. Expect fail.
- EXPECT_FALSE(ValidateJpeg(orig_pixels, 0xfb000000ull));
-
- // No SOI or EOI. Expect fail.
- memset(orig_pixels, 0, kSize);
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- // SOI but no EOI. Expect fail.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[2] = 0xff;
- for (int times = 0; times < benchmark_iterations_; ++times) {
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
- }
-
- // EOI but no SOI. Expect fail.
- orig_pixels[0] = 0;
- orig_pixels[1] = 0;
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- free_aligned_buffer_page_end(orig_pixels);
-}
-
-TEST_F(LibYUVConvertTest, FuzzJpeg) {
- // SOI but no EOI. Expect fail.
- for (int times = 0; times < benchmark_iterations_; ++times) {
- const int kSize = fastrand() % 5000 + 3;
- align_buffer_page_end(orig_pixels, kSize);
- MemRandomize(orig_pixels, kSize);
-
- // Add SOI so frame will be scanned.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[2] = 0xff;
- orig_pixels[kSize - 1] = 0xff;
- ValidateJpeg(orig_pixels,
- kSize); // Failure normally expected.
- free_aligned_buffer_page_end(orig_pixels);
- }
-}
-
-// Test data created in GIMP. When exporting a JPEG, disable
-// thumbnails etc., choose a subsampling mode, and use low quality
-// (50) to keep the size small. Generated with xxd -i test.jpg.
-// test 0 is J400
-static const uint8_t kTest0Jpg[] = {
- 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
- 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
- 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
- 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
- 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
- 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
- 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
- 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xc2, 0x00, 0x0b, 0x08, 0x00, 0x10,
- 0x00, 0x20, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00, 0x17, 0x00, 0x01,
- 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xda, 0x00, 0x08, 0x01,
- 0x01, 0x00, 0x00, 0x00, 0x01, 0x43, 0x7e, 0xa7, 0x97, 0x57, 0xff, 0xc4,
- 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
- 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
- 0x02, 0x3b, 0xc0, 0x6f, 0x66, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
- 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
- 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
- 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
- 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
- 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
- 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
- 0x3f, 0x21, 0x65, 0x6e, 0x31, 0x86, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
- 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
- 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x08,
- 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x35, 0xff, 0xc4, 0x00, 0x1f, 0x10,
- 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
- 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
- 0x3f, 0x10, 0x0b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x88, 0xab, 0x8b,
- 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
- 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
- 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
- 0xd9};
-static const size_t kTest0JpgLen = 421;
-
-// test 1 is J444
-static const uint8_t kTest1Jpg[] = {
- 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
- 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
- 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
- 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
- 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
- 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
- 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
- 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
- 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
- 0x01, 0x11, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
- 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
- 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x01, 0x03, 0xff, 0xda,
- 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01,
- 0x40, 0x8f, 0x26, 0xe8, 0xf4, 0xcc, 0xf9, 0x69, 0x2b, 0x1b, 0x2a, 0xcb,
- 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11,
- 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00,
- 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99,
- 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x01, 0x00,
- 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x01, 0x00, 0x10, 0x11, 0x02, 0x12, 0xff, 0xda, 0x00, 0x08,
- 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0xf1, 0x00, 0x27, 0x45, 0xbb, 0x31,
- 0xaf, 0xff, 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x02, 0x03, 0x01, 0x01, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
- 0x02, 0x10, 0x11, 0x41, 0x12, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01,
- 0x01, 0x3f, 0x01, 0xf6, 0x4b, 0x5f, 0x48, 0xb3, 0x69, 0x63, 0x35, 0x72,
- 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11,
- 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00,
- 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2,
- 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c,
- 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61,
- 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21,
- 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01,
- 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48,
- 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01,
- 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x26, 0x61, 0xd4, 0xff,
- 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x03, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
- 0x31, 0x41, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
- 0x10, 0x54, 0xa8, 0xbf, 0x50, 0x87, 0xb0, 0x9d, 0x8b, 0xc4, 0x6a, 0x26,
- 0x6b, 0x2a, 0x9c, 0x1f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x01, 0x01, 0x01,
- 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x01, 0x00, 0x11, 0x21, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02,
- 0x01, 0x01, 0x3f, 0x10, 0x70, 0xe1, 0x3e, 0xd1, 0x8e, 0x0d, 0xe1, 0xb5,
- 0xd5, 0x91, 0x76, 0x43, 0x82, 0x45, 0x4c, 0x7b, 0x7f, 0xff, 0xc4, 0x00,
- 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61,
- 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01,
- 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a,
- 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96,
- 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad,
- 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7,
- 0xd4, 0xff, 0xd9};
-static const size_t kTest1JpgLen = 735;
-
-// test 2 is J420
-static const uint8_t kTest2Jpg[] = {
- 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
- 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
- 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
- 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
- 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
- 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
- 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
- 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
- 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
- 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
- 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x05, 0x01, 0x02, 0x04, 0xff,
- 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0xff,
- 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
- 0x01, 0x20, 0xe7, 0x28, 0xa3, 0x0b, 0x2e, 0x2d, 0xcf, 0xff, 0xc4, 0x00,
- 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10,
- 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02,
- 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62,
- 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x00, 0x03, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x01, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
- 0x01, 0xc8, 0x53, 0xff, 0xc4, 0x00, 0x16, 0x11, 0x01, 0x01, 0x01, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x11, 0x32, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f,
- 0x01, 0xd2, 0xc7, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
- 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
- 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
- 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
- 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
- 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
- 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
- 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
- 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c,
- 0x03, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x13, 0x5f,
- 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11,
- 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x0e,
- 0xa1, 0x3a, 0x76, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x01, 0x00, 0x21, 0x11, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01,
- 0x3f, 0x10, 0x57, 0x0b, 0x08, 0x70, 0xdb, 0xff, 0xc4, 0x00, 0x1f, 0x10,
- 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
- 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
- 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b,
- 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
- 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
- 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
- 0xd9};
-static const size_t kTest2JpgLen = 685;
-
-// test 3 is J422
-static const uint8_t kTest3Jpg[] = {
- 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
- 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
- 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
- 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
- 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
- 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
- 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
- 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
- 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
- 0x01, 0x21, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
- 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
- 0x00, 0x17, 0x01, 0x00, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x00, 0xff,
- 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
- 0x01, 0x43, 0x8d, 0x1f, 0xa2, 0xb3, 0xca, 0x1b, 0x57, 0x0f, 0xff, 0xc4,
- 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
- 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
- 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
- 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, 0x01,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x01, 0x02, 0x10, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03,
- 0x01, 0x01, 0x3f, 0x01, 0x51, 0xce, 0x8c, 0x75, 0xff, 0xc4, 0x00, 0x18,
- 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x61, 0x21, 0xff, 0xda,
- 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xa6, 0xd9, 0x2f, 0x84,
- 0xe8, 0xf0, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda,
- 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32,
- 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00,
- 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31,
- 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f,
- 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9,
- 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6,
- 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03,
- 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x2e, 0x45, 0xff,
- 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
- 0x31, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x53,
- 0x50, 0xba, 0x54, 0xc1, 0x67, 0x4f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
- 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x00, 0x10, 0xff, 0xda, 0x00, 0x08,
- 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0x18, 0x81, 0x5c, 0x04, 0x1a, 0xca,
- 0x91, 0xbf, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04,
- 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
- 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff,
- 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9,
- 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5,
- 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c,
- 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00,
- 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
-static const size_t kTest3JpgLen = 704;
-
-// test 4 is J422 vertical (1x2 sampling, i.e. 4:4:0) - not supported
-static const uint8_t kTest4Jpg[] = {
- 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
- 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
- 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
- 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
- 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
- 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
- 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
- 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
- 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
- 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
- 0x01, 0x12, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
- 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x05, 0x01, 0x02, 0x03, 0xff,
- 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03, 0xff,
- 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
- 0x01, 0xd2, 0x98, 0xe9, 0x03, 0x0c, 0x00, 0x46, 0x21, 0xd9, 0xff, 0xc4,
- 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
- 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
- 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
- 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x11, 0x01, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01,
- 0x3f, 0x01, 0x98, 0xb1, 0xbd, 0x47, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
- 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x12, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08,
- 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xb6, 0x35, 0xa2, 0xe1, 0x47, 0xff,
- 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02,
- 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01,
- 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9,
- 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01,
- 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0,
- 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e,
- 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde,
- 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a,
- 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02,
- 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x24, 0xaf, 0xff, 0xc4, 0x00, 0x19,
- 0x11, 0x00, 0x03, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x51, 0x21, 0x31, 0xff,
- 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x59, 0x11, 0xca,
- 0x42, 0x60, 0x9f, 0x69, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03,
- 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x01, 0x11, 0x21, 0x31, 0x61, 0xff, 0xda, 0x00, 0x08, 0x01,
- 0x02, 0x01, 0x01, 0x3f, 0x10, 0xb0, 0xd7, 0x27, 0x51, 0xb6, 0x41, 0xff,
- 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31,
- 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08,
- 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a,
- 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd,
- 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30,
- 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03,
- 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
-static const size_t kTest4JpgLen = 701;
-
-TEST_F(LibYUVConvertTest, TestMJPGSize) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- printf("test jpeg size %d x %d\n", width, height);
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGToI420) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int half_width = (width + 1) / 2;
- int half_height = (height + 1) / 2;
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- align_buffer_page_end(dst_y, width * height);
- align_buffer_page_end(dst_u, half_width * half_height);
- align_buffer_page_end(dst_v, half_width * half_height);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_u, half_width,
- dst_v, half_width, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Test result matches known hash value.
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
- uint32_t dst_u_hash = HashDjb2(dst_u, half_width * half_height, 5381);
- uint32_t dst_v_hash = HashDjb2(dst_v, half_width * half_height, 5381);
- EXPECT_EQ(dst_y_hash, 2682851208u);
- EXPECT_EQ(dst_u_hash, 2501859930u);
- EXPECT_EQ(dst_v_hash, 2126459123u);
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_u);
- free_aligned_buffer_page_end(dst_v);
-}
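-
-// Hedged arithmetic note on the iteration scaling above: the 32x16 test JPEG
-// is decoded enough times to cover roughly benchmark_width_ x
-// benchmark_height_ pixels per configured iteration, keeping timings
-// comparable across image sizes. With an illustrative 1280x720 benchmark
-// size (not necessarily the fixture default), one iteration becomes:
-TEST_F(LibYUVConvertTest, MJPGIterationScalingSketch) {
-  const int kJpegWidth = 32;  // kTest2Jpg dimensions per MJPGSize above.
-  const int kJpegHeight = 16;
-  const int kBenchWidth = 1280;
-  const int kBenchHeight = 720;
-  EXPECT_EQ(1800, kBenchWidth * kBenchHeight / (kJpegWidth * kJpegHeight));
-}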
-
-TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int half_width = (width + 1) / 2;
- int half_height = (height + 1) / 2;
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- // Convert to NV21
- align_buffer_page_end(dst_y, width * height);
- align_buffer_page_end(dst_vu, half_width * half_height * 2);
-
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_vu,
- half_width * 2, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Convert to I420
- align_buffer_page_end(dst2_y, width * height);
- align_buffer_page_end(dst2_u, half_width * half_height);
- align_buffer_page_end(dst2_v, half_width * half_height);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
- dst2_v, half_width, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Convert I420 to NV21
- align_buffer_page_end(dst3_y, width * height);
- align_buffer_page_end(dst3_vu, half_width * half_height * 2);
-
- I420ToNV21(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
- width, dst3_vu, half_width * 2, width, height);
-
- for (int i = 0; i < width * height; ++i) {
- EXPECT_EQ(dst_y[i], dst3_y[i]);
- }
- for (int i = 0; i < half_width * half_height * 2; ++i) {
-    EXPECT_EQ(dst_vu[i], dst3_vu[i]);
- }
-
- free_aligned_buffer_page_end(dst3_y);
- free_aligned_buffer_page_end(dst3_vu);
-
- free_aligned_buffer_page_end(dst2_y);
- free_aligned_buffer_page_end(dst2_u);
- free_aligned_buffer_page_end(dst2_v);
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_vu);
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int half_width = (width + 1) / 2;
- int half_height = (height + 1) / 2;
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- align_buffer_page_end(dst_y, width * height);
- align_buffer_page_end(dst_uv, half_width * half_height * 2);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
- half_width * 2, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Test result matches known hash value.
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
- uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
- EXPECT_EQ(dst_y_hash, 2682851208u);
- EXPECT_EQ(dst_uv_hash, 1069662856u);
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_uv);
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int half_width = (width + 1) / 2;
- int half_height = (height + 1) / 2;
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- align_buffer_page_end(dst_y, width * height);
- align_buffer_page_end(dst_uv, half_width * half_height * 2);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToNV21(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
- half_width * 2, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Test result matches known hash value.
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
- uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
- EXPECT_EQ(dst_y_hash, 2682851208u);
- EXPECT_EQ(dst_uv_hash, 3543430771u);
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_uv);
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int half_width = (width + 1) / 2;
- int half_height = (height + 1) / 2;
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- align_buffer_page_end(dst_y, width * height);
- align_buffer_page_end(dst_uv, half_width * half_height * 2);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToNV21(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv,
- half_width * 2, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Test result matches known hash value.
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
- uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
- EXPECT_EQ(dst_y_hash, 330644005u);
- EXPECT_EQ(dst_uv_hash, 135214341u);
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_uv);
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int half_width = (width + 1) / 2;
- int half_height = (height + 1) / 2;
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- align_buffer_page_end(dst_y, width * height);
- align_buffer_page_end(dst_uv, half_width * half_height * 2);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToNV21(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv,
- half_width * 2, width, height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Test result matches known hash value.
- uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
- uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
- EXPECT_EQ(dst_y_hash, 2682851208u);
- EXPECT_EQ(dst_uv_hash, 506143297u);
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_uv);
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGToARGB) {
- int width = 0;
- int height = 0;
- int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
- EXPECT_EQ(0, ret);
-
- int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
- benchmark_height_ / (width * height);
-
- align_buffer_page_end(dst_argb, width * height * 4);
- for (int times = 0; times < benchmark_iterations; ++times) {
- ret = MJPGToARGB(kTest3Jpg, kTest3JpgLen, dst_argb, width * 4, width,
- height, width, height);
- }
-  // Expect success.
- EXPECT_EQ(0, ret);
-
- // Test result matches known hash value.
- uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381);
- EXPECT_EQ(dst_argb_hash, 2355976473u);
-
- free_aligned_buffer_page_end(dst_argb);
-}
-
-static int ShowJPegInfo(const uint8_t* sample, size_t sample_size) {
- MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-
- int width = mjpeg_decoder.GetWidth();
- int height = mjpeg_decoder.GetHeight();
-
- // YUV420
- if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 2 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- printf("JPeg is J420, %dx%d %d bytes\n", width, height,
- static_cast<int>(sample_size));
- // YUV422
- } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- printf("JPeg is J422, %dx%d %d bytes\n", width, height,
- static_cast<int>(sample_size));
- // YUV444
- } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
- mjpeg_decoder.GetNumComponents() == 3 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
- mjpeg_decoder.GetVertSampFactor(1) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
- mjpeg_decoder.GetVertSampFactor(2) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- printf("JPeg is J444, %dx%d %d bytes\n", width, height,
- static_cast<int>(sample_size));
- // YUV400
- } else if (mjpeg_decoder.GetColorSpace() ==
- MJpegDecoder::kColorSpaceGrayscale &&
- mjpeg_decoder.GetNumComponents() == 1 &&
- mjpeg_decoder.GetVertSampFactor(0) == 1 &&
- mjpeg_decoder.GetHorizSampFactor(0) == 1) {
- printf("JPeg is J400, %dx%d %d bytes\n", width, height,
- static_cast<int>(sample_size));
- } else {
-    // Unknown colorspace.
-    printf("JPeg has an unknown colorspace.\n");
- }
- mjpeg_decoder.UnloadFrame();
- return ret;
-}
-
-TEST_F(LibYUVConvertTest, TestMJPGInfo) {
- EXPECT_EQ(1, ShowJPegInfo(kTest0Jpg, kTest0JpgLen));
- EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen));
- EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen));
- EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen));
- EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg,
- kTest4JpgLen)); // Valid but unsupported.
-}
-#endif // HAVE_JPEG
-
-TEST_F(LibYUVConvertTest, NV12Crop) {
- const int SUBSAMP_X = 2;
- const int SUBSAMP_Y = 2;
- const int kWidth = benchmark_width_;
- const int kHeight = benchmark_height_;
- const int crop_y =
- ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1;
- const int kDestWidth = benchmark_width_;
- const int kDestHeight = benchmark_height_ - crop_y * 2;
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);
- const int sample_size =
- kWidth * kHeight + kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;
- align_buffer_page_end(src_y, sample_size);
- uint8_t* src_uv = src_y + kWidth * kHeight;
-
- align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
- align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-
- align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight);
- align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-
- for (int i = 0; i < kHeight * kWidth; ++i) {
- src_y[i] = (fastrand() & 0xff);
- }
- for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideUV) * 2; ++i) {
- src_uv[i] = (fastrand() & 0xff);
- }
- memset(dst_y, 1, kDestWidth * kDestHeight);
- memset(dst_u, 2,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- memset(dst_v, 3,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- memset(dst_y_2, 1, kDestWidth * kDestHeight);
- memset(dst_u_2, 2,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- memset(dst_v_2, 3,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-
- ConvertToI420(src_y, sample_size, dst_y_2, kDestWidth, dst_u_2,
- SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v_2,
- SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
- kDestWidth, kDestHeight, libyuv::kRotate0, libyuv::FOURCC_NV12);
-
- NV12ToI420(src_y + crop_y * kWidth, kWidth,
- src_uv + (crop_y / 2) * kStrideUV * 2, kStrideUV * 2, dst_y,
- kDestWidth, dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
- SUBSAMPLE(kDestWidth, SUBSAMP_X), kDestWidth, kDestHeight);
-
- for (int i = 0; i < kDestHeight; ++i) {
- for (int j = 0; j < kDestWidth; ++j) {
-      EXPECT_EQ(dst_y[i * kDestWidth + j], dst_y_2[i * kDestWidth + j]);
- }
- }
- for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
- for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
- EXPECT_EQ(dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
- dst_u_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
- }
- }
- for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
- for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
- EXPECT_EQ(dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
- dst_v_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
- }
- }
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_u);
- free_aligned_buffer_page_end(dst_v);
- free_aligned_buffer_page_end(dst_y_2);
- free_aligned_buffer_page_end(dst_u_2);
- free_aligned_buffer_page_end(dst_v_2);
- free_aligned_buffer_page_end(src_y);
-}
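-
-// The reference crop above relies on NV12's layout: a full-resolution Y plane
-// followed by a single interleaved UV plane at half resolution in both axes.
-// A minimal sketch of the cropped-pointer arithmetic used with NV12ToI420
-// (names here are illustrative only):
-struct NV12CropViewSketch {
-  const uint8_t* y;
-  const uint8_t* uv;
-};
-static inline NV12CropViewSketch CropNV12Sketch(const uint8_t* y,
-                                                const uint8_t* uv,
-                                                int width,
-                                                int crop_x,
-                                                int crop_y) {
-  // One U byte and one V byte per 2x2 block of Y, interleaved.
-  const int uv_stride = ((width + 1) / 2) * 2;
-  NV12CropViewSketch view;
-  view.y = y + crop_y * width + crop_x;
-  view.uv = uv + (crop_y / 2) * uv_stride + (crop_x / 2) * 2;
-  return view;
-}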
-
-TEST_F(LibYUVConvertTest, I420CropOddY) {
- const int SUBSAMP_X = 2;
- const int SUBSAMP_Y = 2;
- const int kWidth = benchmark_width_;
- const int kHeight = benchmark_height_;
- const int crop_y = 1;
- const int kDestWidth = benchmark_width_;
- const int kDestHeight = benchmark_height_ - crop_y * 2;
- const int kStrideU = SUBSAMPLE(kWidth, SUBSAMP_X);
- const int kStrideV = SUBSAMPLE(kWidth, SUBSAMP_X);
- const int sample_size = kWidth * kHeight +
- kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y) +
- kStrideV * SUBSAMPLE(kHeight, SUBSAMP_Y);
- align_buffer_page_end(src_y, sample_size);
- uint8_t* src_u = src_y + kWidth * kHeight;
- uint8_t* src_v = src_u + kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y);
-
- align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
- align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
- SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-
- for (int i = 0; i < kHeight * kWidth; ++i) {
- src_y[i] = (fastrand() & 0xff);
- }
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideU; ++i) {
- src_u[i] = (fastrand() & 0xff);
- }
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideV; ++i) {
- src_v[i] = (fastrand() & 0xff);
- }
- memset(dst_y, 1, kDestWidth * kDestHeight);
- memset(dst_u, 2,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
- memset(dst_v, 3,
- SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-
- MaskCpuFlags(benchmark_cpu_info_);
- for (int i = 0; i < benchmark_iterations_; ++i) {
- ConvertToI420(src_y, sample_size, dst_y, kDestWidth, dst_u,
- SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
- SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
- kDestWidth, kDestHeight, libyuv::kRotate0,
- libyuv::FOURCC_I420);
- }
-
- for (int i = 0; i < kDestHeight; ++i) {
- for (int j = 0; j < kDestWidth; ++j) {
- EXPECT_EQ(src_y[crop_y * kWidth + i * kWidth + j],
- dst_y[i * kDestWidth + j]);
- }
- }
- for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
- for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
- EXPECT_EQ(src_u[(crop_y / 2 + i) * kStrideU + j],
- dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
- }
- }
- for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
- for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
- EXPECT_EQ(src_v[(crop_y / 2 + i) * kStrideV + j],
- dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
- }
- }
-
- free_aligned_buffer_page_end(dst_y);
- free_aligned_buffer_page_end(dst_u);
- free_aligned_buffer_page_end(dst_v);
- free_aligned_buffer_page_end(src_y);
-}
-
-TEST_F(LibYUVConvertTest, TestYToARGB) {
- uint8_t y[32];
- uint8_t expectedg[32];
- for (int i = 0; i < 32; ++i) {
- y[i] = i * 5 + 17;
- expectedg[i] = static_cast<int>((y[i] - 16) * 1.164f + 0.5f);
- }
- uint8_t argb[32 * 4];
- YToARGB(y, 0, argb, 0, 32, 1);
-
- for (int i = 0; i < 32; ++i) {
- printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i],
- argb[i * 4 + 0], argb[i * 4 + 1], argb[i * 4 + 2], argb[i * 4 + 3]);
- }
- for (int i = 0; i < 32; ++i) {
- EXPECT_EQ(expectedg[i], argb[i * 4 + 0]);
- }
-}
-
-static const uint8_t kNoDither4x4[16] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-TEST_F(LibYUVConvertTest, TestNoDither) {
- align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
- align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
- align_buffer_page_end(dst_rgb565dither,
- benchmark_width_ * benchmark_height_ * 2);
- MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
- MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
- MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
- ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
- benchmark_width_, benchmark_height_);
- ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
- benchmark_width_ * 2, kNoDither4x4, benchmark_width_,
- benchmark_height_);
- for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
- EXPECT_EQ(dst_rgb565[i], dst_rgb565dither[i]);
- }
-
- free_aligned_buffer_page_end(src_argb);
- free_aligned_buffer_page_end(dst_rgb565);
- free_aligned_buffer_page_end(dst_rgb565dither);
-}
-
-// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
-static const uint8_t kDither565_4x4[16] = {
- 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
-};
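-
-// A minimal sketch of how a single ordered dither value is applied per pixel
-// before 888 -> 565 truncation, assuming libyuv's RGB565 layout with blue in
-// the low 5 bits (illustrative, not the library's optimized row code). The
-// 4x4 table is indexed by (row & 3) * 4 + (col & 3):
-static inline uint16_t DitherTo565Sketch(int b, int g, int r, int dither) {
-  b += dither;  // add 0..7, then clamp before truncating to 5/6/5 bits
-  g += dither;
-  r += dither;
-  if (b > 255) {
-    b = 255;
-  }
-  if (g > 255) {
-    g = 255;
-  }
-  if (r > 255) {
-    r = 255;
-  }
-  return static_cast<uint16_t>((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
-}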
-
-TEST_F(LibYUVConvertTest, TestDither) {
- align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
- align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
- align_buffer_page_end(dst_rgb565dither,
- benchmark_width_ * benchmark_height_ * 2);
- align_buffer_page_end(dst_argb, benchmark_width_ * benchmark_height_ * 4);
- align_buffer_page_end(dst_argbdither,
- benchmark_width_ * benchmark_height_ * 4);
- MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
- MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
- MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
- MemRandomize(dst_argb, benchmark_width_ * benchmark_height_ * 4);
- MemRandomize(dst_argbdither, benchmark_width_ * benchmark_height_ * 4);
- ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
- benchmark_width_, benchmark_height_);
- ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
- benchmark_width_ * 2, kDither565_4x4, benchmark_width_,
- benchmark_height_);
- RGB565ToARGB(dst_rgb565, benchmark_width_ * 2, dst_argb, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_);
- RGB565ToARGB(dst_rgb565dither, benchmark_width_ * 2, dst_argbdither,
- benchmark_width_ * 4, benchmark_width_, benchmark_height_);
-
- for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
- EXPECT_NEAR(dst_argb[i], dst_argbdither[i], 9);
- }
- free_aligned_buffer_page_end(src_argb);
- free_aligned_buffer_page_end(dst_rgb565);
- free_aligned_buffer_page_end(dst_rgb565dither);
- free_aligned_buffer_page_end(dst_argb);
- free_aligned_buffer_page_end(dst_argbdither);
-}
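-
-// The tolerance of 9 above reflects the worst case: a dither offset of up to
-// 7 can push a 5-bit channel across at most one quantization step, and one
-// step expands to a difference of 8 or 9 in the recovered 8-bit value.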
-
-#define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, dst_argb_c + OFF, \
- kStrideB, NULL, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B##Dither( \
- src_y + OFF, kWidth, src_u + OFF, kStrideUV, src_v + OFF, kStrideUV, \
- dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
- align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \
- memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \
- memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \
- FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
- kWidth, kHeight); \
- FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \
- kWidth * BPP_C, kWidth, kHeight); \
- for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb32_c[i]) - \
- static_cast<int>(dst_argb32_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- free_aligned_buffer_page_end(dst_argb32_c); \
- free_aligned_buffer_page_end(dst_argb32_opt); \
- }
-
-#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF, FMT_C, BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, \
- BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, \
- BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
-
-TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
-
-#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \
- TEST_F(LibYUVConvertTest, NAME) { \
- const int kWidth = benchmark_width_; \
- const int kHeight = benchmark_height_; \
- \
- align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
- align_buffer_page_end(orig_y, kWidth* kHeight); \
- align_buffer_page_end(orig_u, \
- SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
- align_buffer_page_end(orig_v, \
- SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
- \
- align_buffer_page_end(dst_y_orig, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_orig, \
- 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
- \
- align_buffer_page_end(dst_y, kWidth* kHeight); \
- align_buffer_page_end(dst_uv, \
- 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
- \
- MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
- \
- /* Convert UYVY to NV12 in 2 steps for reference */ \
- libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), orig_y, kWidth, \
- orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
- SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
- libyuv::I420ToNV12(orig_y, kWidth, orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
- SUBSAMPLE(kWidth, 2), dst_y_orig, kWidth, dst_uv_orig, \
- 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
- \
- /* Convert to NV12 */ \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), dst_y, kWidth, \
- dst_uv, 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
- } \
- \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- EXPECT_EQ(orig_y[i], dst_y[i]); \
- } \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- EXPECT_EQ(dst_y_orig[i], dst_y[i]); \
- } \
- for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); \
- ++i) { \
- EXPECT_EQ(dst_uv_orig[i], dst_uv[i]); \
- } \
- \
- free_aligned_buffer_page_end(orig_uyvy); \
- free_aligned_buffer_page_end(orig_y); \
- free_aligned_buffer_page_end(orig_u); \
- free_aligned_buffer_page_end(orig_v); \
- free_aligned_buffer_page_end(dst_y_orig); \
- free_aligned_buffer_page_end(dst_uv_orig); \
- free_aligned_buffer_page_end(dst_y); \
- free_aligned_buffer_page_end(dst_uv); \
- }
-
-TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12)
-TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12)
-
-// Transitive tests. Converting A to B and then B to C should equal A to C.
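-// For example, I420ToARGB followed by ARGBToABGR must match I420ToABGR byte
-// for byte.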
-
-#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- W1280, N, NEG, OFF, FMT_C, BPP_C) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, dst_argb_b + OFF, \
- kStrideB, kWidth, NEG kHeight); \
- } \
- /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
- const int kStrideC = kWidth * BPP_C; \
- align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
- align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
- memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
- memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
- FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideC, \
- kWidth, NEG kHeight); \
- /* Convert B to C */ \
- FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \
- kWidth, kHeight); \
- for (int i = 0; i < kStrideC * kHeight; ++i) { \
- EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
- } \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_b); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_bc); \
- }
-
-#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- FMT_C, BPP_C) \
- TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \
- TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \
- TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
-
-TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4)
-TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
-TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3)
-TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3)
-TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4)
-TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, ARGB, 4)
-TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3)
-TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3)
-TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3)
-TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
-TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
-TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
-TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
-TESTPLANARTOE(J422, 2, 1, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(J422, 2, 1, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
-TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
-TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
-TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
-TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
-TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
-
-#define TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
- const int kSizeUV = \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
- align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- src_a[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
- dst_argb_b + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \
- } \
- /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
- const int kStrideC = kWidth * BPP_C; \
- align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
- align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
- memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
- memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
- FMT_PLANAR##To##FMT_C( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
- dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN); \
- /* Convert B to C */ \
- FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \
- kWidth, kHeight); \
- for (int i = 0; i < kStrideC * kHeight; ++i) { \
- EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
- } \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(src_a); \
- free_aligned_buffer_page_end(dst_argb_b); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_bc); \
- }
-
-#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- FMT_C, BPP_C) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C, 0) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C, 0) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Invert, -, 0, FMT_C, BPP_C, 0) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0) \
- TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
- benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1)
-
-TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
-TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
-
-#define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \
- OFF, FMT_C, BPP_C) \
- TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_##FMT_C##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \
- const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
- align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF); \
- align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
- MemRandomize(src_argb_a + OFF, kStrideA * kHeight); \
- memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB, \
- kWidth, NEG kHeight); \
- } \
- /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
- const int kStrideC = kWidth * BPP_C; \
- align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
- align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
- memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
- memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
- FMT_A##To##FMT_C(src_argb_a + OFF, kStrideA, dst_argb_c + OFF, kStrideC, \
- kWidth, NEG kHeight); \
- /* Convert B to C */ \
- FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \
- kWidth, kHeight); \
- for (int i = 0; i < kStrideC * kHeight; i += 4) { \
- EXPECT_EQ(dst_argb_c[i + OFF + 0], dst_argb_bc[i + OFF + 0]); \
- EXPECT_EQ(dst_argb_c[i + OFF + 1], dst_argb_bc[i + OFF + 1]); \
- EXPECT_EQ(dst_argb_c[i + OFF + 2], dst_argb_bc[i + OFF + 2]); \
- EXPECT_NEAR(dst_argb_c[i + OFF + 3], dst_argb_bc[i + OFF + 3], 64); \
- } \
- free_aligned_buffer_page_end(src_argb_a); \
- free_aligned_buffer_page_end(dst_argb_b); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_bc); \
- }
-
-#define TESTPLANETOE(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C) \
- TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, \
- benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \
- TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
- _Unaligned, +, 1, FMT_C, BPP_C) \
- TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
- _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
- _Opt, +, 0, FMT_C, BPP_C)
-
-// Caveat: Destination needs to be 4 bytes per pixel.
-TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
-TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
-TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
-TESTPLANETOE(AR30, 1, 4, ABGR, 1, 4, ARGB, 4)
-TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4)
-TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4)
-TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4)
-TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4)
-
-TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
- // 2x2 frames
- uint32_t src[4];
- uint32_t dst[4];
- // some random input
- src[0] = 0x11000000;
- src[1] = 0x00450000;
- src[2] = 0x00009f00;
- src[3] = 0x000000ff;
- // zeros on destination
- dst[0] = 0x00000000;
- dst[1] = 0x00000000;
- dst[2] = 0x00000000;
- dst[3] = 0x00000000;
-
- int r = ConvertToARGB(reinterpret_cast<uint8_t*>(src),
- 16, // input size
- reinterpret_cast<uint8_t*>(dst),
- 8, // destination stride
- 0, // crop_x
- 0, // crop_y
- 2, // width
- 2, // height
- 2, // crop width
- 2, // crop height
- kRotate90, FOURCC_ARGB);
-
- EXPECT_EQ(r, 0);
- // 90 degrees rotation, no conversion
- EXPECT_EQ(dst[0], src[2]);
- EXPECT_EQ(dst[1], src[0]);
- EXPECT_EQ(dst[2], src[3]);
- EXPECT_EQ(dst[3], src[1]);
-}
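-
-// The expectations above follow the clockwise 90-degree mapping
-// dst(x, y) = src(y, height - 1 - x). A minimal sketch for a row-major
-// width x height frame of 32-bit pixels (illustrative only):
-static inline void Rotate90Sketch(const uint32_t* src,
-                                  uint32_t* dst,
-                                  int width,
-                                  int height) {
-  // After rotation the destination is height pixels wide, width pixels tall.
-  for (int y = 0; y < width; ++y) {
-    for (int x = 0; x < height; ++x) {
-      dst[y * height + x] = src[(height - 1 - x) * width + y];
-    }
-  }
-}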
-
-#ifdef HAS_ARGBTOAR30ROW_AVX2
-TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
- // ARGBToAR30Row_AVX2 expects a multiple of 8 pixels.
- const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
- align_buffer_page_end(src, kPixels * 4);
- align_buffer_page_end(dst_opt, kPixels * 4);
- align_buffer_page_end(dst_c, kPixels * 4);
- MemRandomize(src, kPixels * 4);
- memset(dst_opt, 0, kPixels * 4);
- memset(dst_c, 1, kPixels * 4);
-
- ARGBToAR30Row_C(src, dst_c, kPixels);
-
- int has_avx2 = TestCpuFlag(kCpuHasAVX2);
- int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
- for (int i = 0; i < benchmark_iterations_; ++i) {
- if (has_avx2) {
- ARGBToAR30Row_AVX2(src, dst_opt, kPixels);
- } else if (has_ssse3) {
- ARGBToAR30Row_SSSE3(src, dst_opt, kPixels);
- } else {
- ARGBToAR30Row_C(src, dst_opt, kPixels);
- }
- }
- for (int i = 0; i < kPixels * 4; ++i) {
- EXPECT_EQ(dst_opt[i], dst_c[i]);
- }
-
- free_aligned_buffer_page_end(src);
- free_aligned_buffer_page_end(dst_opt);
- free_aligned_buffer_page_end(dst_c);
-}
-#endif // HAS_ARGBTOAR30ROW_AVX2
-
-#ifdef HAS_ABGRTOAR30ROW_AVX2
-TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
- // ABGRToAR30Row_AVX2 expects a multiple of 8 pixels.
- const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
- align_buffer_page_end(src, kPixels * 4);
- align_buffer_page_end(dst_opt, kPixels * 4);
- align_buffer_page_end(dst_c, kPixels * 4);
- MemRandomize(src, kPixels * 4);
- memset(dst_opt, 0, kPixels * 4);
- memset(dst_c, 1, kPixels * 4);
-
- ABGRToAR30Row_C(src, dst_c, kPixels);
-
- int has_avx2 = TestCpuFlag(kCpuHasAVX2);
- int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
- for (int i = 0; i < benchmark_iterations_; ++i) {
- if (has_avx2) {
- ABGRToAR30Row_AVX2(src, dst_opt, kPixels);
- } else if (has_ssse3) {
- ABGRToAR30Row_SSSE3(src, dst_opt, kPixels);
- } else {
- ABGRToAR30Row_C(src, dst_opt, kPixels);
- }
- }
- for (int i = 0; i < kPixels * 4; ++i) {
- EXPECT_EQ(dst_opt[i], dst_c[i]);
- }
-
- free_aligned_buffer_page_end(src);
- free_aligned_buffer_page_end(dst_opt);
- free_aligned_buffer_page_end(dst_c);
-}
-#endif // HAS_ABGRTOAR30ROW_AVX2
-
-// TODO(fbarchard): Fix clamping issue affecting the U channel.
-#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- const int kBpc = 2; \
- align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
- align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
- align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & 0x3ff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = (fastrand() & 0x3ff); \
- reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = (fastrand() & 0x3ff); \
- } \
- memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B( \
- reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
- reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
- reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
- dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B( \
- reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
- reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
- reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
- dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i + DOFF]) - \
- static_cast<int>(dst_argb_opt[i + DOFF])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- }
-
-#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF) \
- TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \
- TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 1) \
- TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \
- TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0)
-
-TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1, 2)
-TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1, 2)
-TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 2)
-TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2)
-TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1, 2)
-
-static int Clamp(int y) {
- if (y < 0) {
- y = 0;
- }
- if (y > 255) {
- y = 255;
- }
- return y;
-}
-
-static int Clamp10(int y) {
- if (y < 0) {
- y = 0;
- }
- if (y > 1023) {
- y = 1023;
- }
- return y;
-}
-
-// Test 8 bit YUV to 8 bit RGB
-TEST_F(LibYUVConvertTest, TestH420ToARGB) {
- const int kSize = 256;
- int histogram_b[256];
- int histogram_g[256];
- int histogram_r[256];
- memset(histogram_b, 0, sizeof(histogram_b));
- memset(histogram_g, 0, sizeof(histogram_g));
- memset(histogram_r, 0, sizeof(histogram_r));
- align_buffer_page_end(orig_yuv, kSize + kSize / 2 * 2);
- align_buffer_page_end(argb_pixels, kSize * 4);
- uint8_t* orig_y = orig_yuv;
- uint8_t* orig_u = orig_y + kSize;
- uint8_t* orig_v = orig_u + kSize / 2;
-
-  // Test greyscale ramp.
- for (int i = 0; i < kSize; ++i) {
- orig_y[i] = i;
- }
- for (int i = 0; i < kSize / 2; ++i) {
-    orig_u[i] = 128;  // 128 is zero (neutral) chroma.
- orig_v[i] = 128;
- }
-
- H420ToARGB(orig_y, 0, orig_u, 0, orig_v, 0, argb_pixels, 0, kSize, 1);
-
- for (int i = 0; i < kSize; ++i) {
- int b = argb_pixels[i * 4 + 0];
- int g = argb_pixels[i * 4 + 1];
- int r = argb_pixels[i * 4 + 2];
- int a = argb_pixels[i * 4 + 3];
- ++histogram_b[b];
- ++histogram_g[g];
- ++histogram_r[r];
- int expected_y = Clamp(static_cast<int>((i - 16) * 1.164f));
- EXPECT_NEAR(b, expected_y, 1);
- EXPECT_NEAR(g, expected_y, 1);
- EXPECT_NEAR(r, expected_y, 1);
- EXPECT_EQ(a, 255);
- }
-
- int count_b = 0;
- int count_g = 0;
- int count_r = 0;
- for (int i = 0; i < kSize; ++i) {
- if (histogram_b[i]) {
- ++count_b;
- }
- if (histogram_g[i]) {
- ++count_g;
- }
- if (histogram_r[i]) {
- ++count_r;
- }
- }
- printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
-
- free_aligned_buffer_page_end(orig_yuv);
- free_aligned_buffer_page_end(argb_pixels);
-}
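-
-// The expected grey level above comes from limited-range expansion: 8-bit Y
-// in [16, 235] maps to full range as (y - 16) * 255 / 219, a gain of roughly
-// 1.164. A minimal integer sketch (rounding may differ by 1 from the
-// optimized paths):
-static inline int ExpandLimitedRangeY8Sketch(int y) {
-  return Clamp(((y - 16) * 255 + 109) / 219);  // +109 rounds the division
-}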
-
-// Test 10 bit YUV to 8 bit RGB
-TEST_F(LibYUVConvertTest, TestH010ToARGB) {
- const int kSize = 1024;
- int histogram_b[1024];
- int histogram_g[1024];
- int histogram_r[1024];
- memset(histogram_b, 0, sizeof(histogram_b));
- memset(histogram_g, 0, sizeof(histogram_g));
- memset(histogram_r, 0, sizeof(histogram_r));
- align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
- align_buffer_page_end(argb_pixels, kSize * 4);
- uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
- uint16_t* orig_u = orig_y + kSize;
- uint16_t* orig_v = orig_u + kSize / 2;
-
-  // Test greyscale ramp.
- for (int i = 0; i < kSize; ++i) {
- orig_y[i] = i;
- }
- for (int i = 0; i < kSize / 2; ++i) {
-    orig_u[i] = 512;  // 512 is zero (neutral) chroma for 10 bit.
- orig_v[i] = 512;
- }
-
- H010ToARGB(orig_y, 0, orig_u, 0, orig_v, 0, argb_pixels, 0, kSize, 1);
-
- for (int i = 0; i < kSize; ++i) {
- int b = argb_pixels[i * 4 + 0];
- int g = argb_pixels[i * 4 + 1];
- int r = argb_pixels[i * 4 + 2];
- int a = argb_pixels[i * 4 + 3];
- ++histogram_b[b];
- ++histogram_g[g];
- ++histogram_r[r];
- int expected_y = Clamp(static_cast<int>((i - 64) * 1.164f / 4));
- EXPECT_NEAR(b, expected_y, 1);
- EXPECT_NEAR(g, expected_y, 1);
- EXPECT_NEAR(r, expected_y, 1);
- EXPECT_EQ(a, 255);
- }
-
- int count_b = 0;
- int count_g = 0;
- int count_r = 0;
- for (int i = 0; i < kSize; ++i) {
- if (histogram_b[i]) {
- ++count_b;
- }
- if (histogram_g[i]) {
- ++count_g;
- }
- if (histogram_r[i]) {
- ++count_r;
- }
- }
- printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
-
- free_aligned_buffer_page_end(orig_yuv);
- free_aligned_buffer_page_end(argb_pixels);
-}
-
-// Test 10 bit YUV to 10 bit RGB
-// Caveat: Results are compared with EXPECT_NEAR because the expected values
-// are computed with float rounding.
-TEST_F(LibYUVConvertTest, TestH010ToAR30) {
- const int kSize = 1024;
- int histogram_b[1024];
- int histogram_g[1024];
- int histogram_r[1024];
- memset(histogram_b, 0, sizeof(histogram_b));
- memset(histogram_g, 0, sizeof(histogram_g));
- memset(histogram_r, 0, sizeof(histogram_r));
-
- align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
- align_buffer_page_end(ar30_pixels, kSize * 4);
- uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
- uint16_t* orig_u = orig_y + kSize;
- uint16_t* orig_v = orig_u + kSize / 2;
-
-  // Test greyscale ramp.
- for (int i = 0; i < kSize; ++i) {
- orig_y[i] = i;
- }
- for (int i = 0; i < kSize / 2; ++i) {
-    orig_u[i] = 512;  // 512 is zero (neutral) chroma for 10 bit.
- orig_v[i] = 512;
- }
-
- H010ToAR30(orig_y, 0, orig_u, 0, orig_v, 0, ar30_pixels, 0, kSize, 1);
-
- for (int i = 0; i < kSize; ++i) {
- int b10 = reinterpret_cast<uint32_t*>(ar30_pixels)[i] & 1023;
- int g10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 10) & 1023;
- int r10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 20) & 1023;
- int a2 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 30) & 3;
- ++histogram_b[b10];
- ++histogram_g[g10];
- ++histogram_r[r10];
- int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f));
- EXPECT_NEAR(b10, expected_y, 4);
- EXPECT_NEAR(g10, expected_y, 4);
- EXPECT_NEAR(r10, expected_y, 4);
- EXPECT_EQ(a2, 3);
- }
-
- int count_b = 0;
- int count_g = 0;
- int count_r = 0;
- for (int i = 0; i < kSize; ++i) {
- if (histogram_b[i]) {
- ++count_b;
- }
- if (histogram_g[i]) {
- ++count_g;
- }
- if (histogram_r[i]) {
- ++count_r;
- }
- }
- printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
-
- free_aligned_buffer_page_end(orig_yuv);
- free_aligned_buffer_page_end(ar30_pixels);
-}
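-
-// AR30 packs 2:10:10:10 little-endian with blue in bits 0-9, green in bits
-// 10-19, red in bits 20-29 and 2-bit alpha in bits 30-31, matching the
-// extraction above; AB30 is the same layout with red and blue swapped.
-// A minimal packing sketch:
-static inline uint32_t PackAR30Sketch(uint32_t a2,
-                                      uint32_t r10,
-                                      uint32_t g10,
-                                      uint32_t b10) {
-  return (a2 << 30) | (r10 << 20) | (g10 << 10) | b10;
-}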
-
-// Test 10 bit YUV to 10 bit RGB
-// Caveat: Results are compared with EXPECT_NEAR because the expected values
-// are computed with float rounding.
-TEST_F(LibYUVConvertTest, TestH010ToAB30) {
- const int kSize = 1024;
- int histogram_b[1024];
- int histogram_g[1024];
- int histogram_r[1024];
- memset(histogram_b, 0, sizeof(histogram_b));
- memset(histogram_g, 0, sizeof(histogram_g));
- memset(histogram_r, 0, sizeof(histogram_r));
-
- align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
- align_buffer_page_end(ab30_pixels, kSize * 4);
- uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
- uint16_t* orig_u = orig_y + kSize;
- uint16_t* orig_v = orig_u + kSize / 2;
-
-  // Test greyscale ramp.
- for (int i = 0; i < kSize; ++i) {
- orig_y[i] = i;
- }
- for (int i = 0; i < kSize / 2; ++i) {
-    orig_u[i] = 512;  // 512 is zero (neutral) chroma for 10 bit.
- orig_v[i] = 512;
- }
-
- H010ToAB30(orig_y, 0, orig_u, 0, orig_v, 0, ab30_pixels, 0, kSize, 1);
-
- for (int i = 0; i < kSize; ++i) {
- int r10 = reinterpret_cast<uint32_t*>(ab30_pixels)[i] & 1023;
- int g10 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 10) & 1023;
- int b10 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 20) & 1023;
- int a2 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 30) & 3;
- ++histogram_b[b10];
- ++histogram_g[g10];
- ++histogram_r[r10];
- int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f));
- EXPECT_NEAR(b10, expected_y, 4);
- EXPECT_NEAR(g10, expected_y, 4);
- EXPECT_NEAR(r10, expected_y, 4);
- EXPECT_EQ(a2, 3);
- }
-
- int count_b = 0;
- int count_g = 0;
- int count_r = 0;
- for (int i = 0; i < kSize; ++i) {
- if (histogram_b[i]) {
- ++count_b;
- }
- if (histogram_g[i]) {
- ++count_g;
- }
- if (histogram_r[i]) {
- ++count_r;
- }
- }
- printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
-
- free_aligned_buffer_page_end(orig_yuv);
- free_aligned_buffer_page_end(ab30_pixels);
-}
-
-// Test 8 bit YUV to 10 bit RGB
-TEST_F(LibYUVConvertTest, TestH420ToAR30) {
- const int kSize = 256;
- const int kHistSize = 1024;
- int histogram_b[kHistSize];
- int histogram_g[kHistSize];
- int histogram_r[kHistSize];
- memset(histogram_b, 0, sizeof(histogram_b));
- memset(histogram_g, 0, sizeof(histogram_g));
- memset(histogram_r, 0, sizeof(histogram_r));
- align_buffer_page_end(orig_yuv, kSize + kSize / 2 * 2);
- align_buffer_page_end(ar30_pixels, kSize * 4);
- uint8_t* orig_y = orig_yuv;
- uint8_t* orig_u = orig_y + kSize;
- uint8_t* orig_v = orig_u + kSize / 2;
-
-  // Test greyscale ramp.
- for (int i = 0; i < kSize; ++i) {
- orig_y[i] = i;
- }
- for (int i = 0; i < kSize / 2; ++i) {
-    orig_u[i] = 128;  // 128 is zero (neutral) chroma.
- orig_v[i] = 128;
- }
-
- H420ToAR30(orig_y, 0, orig_u, 0, orig_v, 0, ar30_pixels, 0, kSize, 1);
-
- for (int i = 0; i < kSize; ++i) {
- int b10 = reinterpret_cast<uint32_t*>(ar30_pixels)[i] & 1023;
- int g10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 10) & 1023;
- int r10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 20) & 1023;
- int a2 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 30) & 3;
- ++histogram_b[b10];
- ++histogram_g[g10];
- ++histogram_r[r10];
- int expected_y = Clamp10(static_cast<int>((i - 16) * 1.164f * 4.f));
- EXPECT_NEAR(b10, expected_y, 4);
- EXPECT_NEAR(g10, expected_y, 4);
- EXPECT_NEAR(r10, expected_y, 4);
- EXPECT_EQ(a2, 3);
- }
-
- int count_b = 0;
- int count_g = 0;
- int count_r = 0;
- for (int i = 0; i < kHistSize; ++i) {
- if (histogram_b[i]) {
- ++count_b;
- }
- if (histogram_g[i]) {
- ++count_g;
- }
- if (histogram_r[i]) {
- ++count_r;
- }
- }
- printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
-
- free_aligned_buffer_page_end(orig_yuv);
- free_aligned_buffer_page_end(ar30_pixels);
-}
-
-// Test RGB24 to ARGB and back to RGB24
-TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
- const int kSize = 256;
- align_buffer_page_end(orig_rgb24, kSize * 3);
- align_buffer_page_end(argb_pixels, kSize * 4);
- align_buffer_page_end(dest_rgb24, kSize * 3);
-
-  // Test a byte ramp across the RGB24 channels.
- for (int i = 0; i < kSize * 3; ++i) {
- orig_rgb24[i] = i;
- }
-
- RGB24ToARGB(orig_rgb24, 0, argb_pixels, 0, kSize, 1);
- ARGBToRGB24(argb_pixels, 0, dest_rgb24, 0, kSize, 1);
-
- for (int i = 0; i < kSize * 3; ++i) {
- EXPECT_EQ(orig_rgb24[i], dest_rgb24[i]);
- }
-
- free_aligned_buffer_page_end(orig_rgb24);
- free_aligned_buffer_page_end(argb_pixels);
- free_aligned_buffer_page_end(dest_rgb24);
-}
-
-} // namespace libyuv
diff --git a/files/unit_test/cpu_test.cc b/files/unit_test/cpu_test.cc
deleted file mode 100644
index a7991d2b..00000000
--- a/files/unit_test/cpu_test.cc
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "../unit_test/unit_test.h"
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/version.h"
-
-namespace libyuv {
-
-TEST_F(LibYUVBaseTest, TestCpuHas) {
- int cpu_flags = TestCpuFlag(-1);
- printf("Cpu Flags %d\n", cpu_flags);
-#if defined(__arm__) || defined(__aarch64__)
- int has_arm = TestCpuFlag(kCpuHasARM);
- printf("Has ARM %d\n", has_arm);
- int has_neon = TestCpuFlag(kCpuHasNEON);
- printf("Has NEON %d\n", has_neon);
-#endif
- int has_x86 = TestCpuFlag(kCpuHasX86);
- int has_sse2 = TestCpuFlag(kCpuHasSSE2);
- int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
- int has_sse41 = TestCpuFlag(kCpuHasSSE41);
- int has_sse42 = TestCpuFlag(kCpuHasSSE42);
- int has_avx = TestCpuFlag(kCpuHasAVX);
- int has_avx2 = TestCpuFlag(kCpuHasAVX2);
- int has_erms = TestCpuFlag(kCpuHasERMS);
- int has_fma3 = TestCpuFlag(kCpuHasFMA3);
- int has_f16c = TestCpuFlag(kCpuHasF16C);
- int has_gfni = TestCpuFlag(kCpuHasGFNI);
- int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
- int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
- int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
- int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
- int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
- int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ);
- printf("Has X86 %d\n", has_x86);
- printf("Has SSE2 %d\n", has_sse2);
- printf("Has SSSE3 %d\n", has_ssse3);
- printf("Has SSE41 %d\n", has_sse41);
- printf("Has SSE42 %d\n", has_sse42);
- printf("Has AVX %d\n", has_avx);
- printf("Has AVX2 %d\n", has_avx2);
- printf("Has ERMS %d\n", has_erms);
- printf("Has FMA3 %d\n", has_fma3);
- printf("Has F16C %d\n", has_f16c);
- printf("Has GFNI %d\n", has_gfni);
- printf("Has AVX512BW %d\n", has_avx512bw);
- printf("Has AVX512VL %d\n", has_avx512vl);
- printf("Has AVX512VBMI %d\n", has_avx512vbmi);
- printf("Has AVX512VBMI2 %d\n", has_avx512vbmi2);
- printf("Has AVX512VBITALG %d\n", has_avx512vbitalg);
- printf("Has AVX512VPOPCNTDQ %d\n", has_avx512vpopcntdq);
-
-#if defined(__mips__)
- int has_mips = TestCpuFlag(kCpuHasMIPS);
- printf("Has MIPS %d\n", has_mips);
- int has_msa = TestCpuFlag(kCpuHasMSA);
- printf("Has MSA %d\n", has_msa);
- int has_mmi = TestCpuFlag(kCpuHasMMI);
- printf("Has MMI %d\n", has_mmi);
-#endif
-}
-
-TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) {
-#if defined(__aarch64__)
- printf("Arm64 build\n");
-#endif
-#if defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)
- printf("Neon build enabled\n");
-#endif
-#if defined(__x86_64__) || defined(_M_X64)
- printf("x64 build\n");
-#endif
-#ifdef _MSC_VER
- printf("_MSC_VER %d\n", _MSC_VER);
-#endif
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(GCC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
- defined(VISUALC_HAS_AVX2))
- printf("Has AVX2 1\n");
-#else
- printf("Has AVX2 0\n");
-// If the compiler does not support AVX2, the AVX2 code paths are not
-// expected to be built.
-#endif
-}
-
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
- defined(_M_X64)
-TEST_F(LibYUVBaseTest, TestCpuId) {
- int has_x86 = TestCpuFlag(kCpuHasX86);
- if (has_x86) {
- int cpu_info[4];
- // Vendor ID:
- // AuthenticAMD AMD processor
- // CentaurHauls Centaur processor
- // CyrixInstead Cyrix processor
- // GenuineIntel Intel processor
- // GenuineTMx86 Transmeta processor
- // Geode by NSC National Semiconductor processor
- // NexGenDriven NexGen processor
- // RiseRiseRise Rise Technology processor
- // SiS SiS SiS SiS processor
- // UMC UMC UMC UMC processor
- CpuId(0, 0, cpu_info);
-    cpu_info[0] = cpu_info[1];  // Reorder to the 12-byte vendor string:
-    cpu_info[1] = cpu_info[3];  // EBX, EDX, ECX, NUL-terminated.
-    cpu_info[3] = 0;
- printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast<char*>(&cpu_info[0]),
- cpu_info[0], cpu_info[1], cpu_info[2]);
- EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0])));
-
- // CPU Family and Model
- // 3:0 - Stepping
- // 7:4 - Model
- // 11:8 - Family
- // 13:12 - Processor Type
- // 19:16 - Extended Model
- // 27:20 - Extended Family
- CpuId(1, 0, cpu_info);
- int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
- int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
- printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model,
- model);
- }
-}
-#endif
-
-static int FileExists(const char* file_name) {
- FILE* f = fopen(file_name, "r");
- if (!f) {
- return 0;
- }
- fclose(f);
- return 1;
-}
-
-TEST_F(LibYUVBaseTest, TestLinuxNeon) {
- if (FileExists("../../unit_test/testdata/arm_v7.txt")) {
- printf("Note: testing to load \"../../unit_test/testdata/arm_v7.txt\"\n");
-
- EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
- EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
- EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/juno.txt"));
- } else {
- printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n");
- }
-#if defined(__linux__) && defined(__ARM_NEON__)
- EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("/proc/cpuinfo"));
-#endif
-}
-
-TEST_F(LibYUVBaseTest, TestSetCpuFlags) {
- // Reset any masked flags that may have been set so auto init is enabled.
- MaskCpuFlags(0);
-
- int original_cpu_flags = TestCpuFlag(-1);
-
- // Test setting different CPU configurations.
- int cpu_flags = kCpuHasARM | kCpuHasNEON | kCpuInitialized;
- SetCpuFlags(cpu_flags);
- EXPECT_EQ(cpu_flags, TestCpuFlag(-1));
-
- cpu_flags = kCpuHasX86 | kCpuInitialized;
- SetCpuFlags(cpu_flags);
- EXPECT_EQ(cpu_flags, TestCpuFlag(-1));
-
- // Test that setting 0 turns auto-init back on.
- SetCpuFlags(0);
- EXPECT_EQ(original_cpu_flags, TestCpuFlag(-1));
-
- // Restore the CPU flag mask.
- MaskCpuFlags(benchmark_cpu_info_);
-}
-
-} // namespace libyuv
diff --git a/files/unit_test/rotate_test.cc b/files/unit_test/rotate_test.cc
deleted file mode 100644
index 61941e63..00000000
--- a/files/unit_test/rotate_test.cc
+++ /dev/null
@@ -1,394 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-#include "../unit_test/unit_test.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/rotate.h"
-
-namespace libyuv {
-
-static void I420TestRotate(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- libyuv::RotationMode mode,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
- if (src_width < 1) {
- src_width = 1;
- }
- if (src_height == 0) {
- src_height = 1;
- }
- if (dst_width < 1) {
- dst_width = 1;
- }
- if (dst_height < 1) {
- dst_height = 1;
- }
- int src_i420_y_size = src_width * Abs(src_height);
- int src_i420_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2);
- int src_i420_size = src_i420_y_size + src_i420_uv_size * 2;
- align_buffer_page_end(src_i420, src_i420_size);
- for (int i = 0; i < src_i420_size; ++i) {
- src_i420[i] = fastrand() & 0xff;
- }
-
- int dst_i420_y_size = dst_width * dst_height;
- int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
- int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
- align_buffer_page_end(dst_i420_c, dst_i420_size);
- align_buffer_page_end(dst_i420_opt, dst_i420_size);
- memset(dst_i420_c, 2, dst_i420_size);
- memset(dst_i420_opt, 3, dst_i420_size);
-
- MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
- I420Rotate(src_i420, src_width, src_i420 + src_i420_y_size,
- (src_width + 1) / 2, src_i420 + src_i420_y_size + src_i420_uv_size,
- (src_width + 1) / 2, dst_i420_c, dst_width,
- dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
- dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
- (dst_width + 1) / 2, src_width, src_height, mode);
-
- MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
- for (int i = 0; i < benchmark_iterations; ++i) {
- I420Rotate(
- src_i420, src_width, src_i420 + src_i420_y_size, (src_width + 1) / 2,
- src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
- dst_i420_opt, dst_width, dst_i420_opt + dst_i420_y_size,
- (dst_width + 1) / 2, dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
- (dst_width + 1) / 2, src_width, src_height, mode);
- }
-
- // Rotation should be exact.
- for (int i = 0; i < dst_i420_size; ++i) {
- EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
- }
-
- free_aligned_buffer_page_end(dst_i420_c);
- free_aligned_buffer_page_end(dst_i420_opt);
- free_aligned_buffer_page_end(src_i420);
-}
-
-TEST_F(LibYUVRotateTest, I420Rotate0_Opt) {
- I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate0, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, I420Rotate90_Opt) {
- I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate90, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, I420Rotate180_Opt) {
- I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate180, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
- I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate270, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-// TODO(fbarchard): Remove odd width tests.
-// Odd width tests work but are disabled because they use C code; they can be
-// exercised by passing an odd width on the command line or via an
-// environment variable.
-TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) {
- I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) {
- I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) {
- I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
- I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-static void I444TestRotate(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- libyuv::RotationMode mode,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
- if (src_width < 1) {
- src_width = 1;
- }
- if (src_height == 0) {
- src_height = 1;
- }
- if (dst_width < 1) {
- dst_width = 1;
- }
- if (dst_height < 1) {
- dst_height = 1;
- }
- int src_i444_y_size = src_width * Abs(src_height);
- int src_i444_uv_size = src_width * Abs(src_height);
- int src_i444_size = src_i444_y_size + src_i444_uv_size * 2;
- align_buffer_page_end(src_i444, src_i444_size);
- for (int i = 0; i < src_i444_size; ++i) {
- src_i444[i] = fastrand() & 0xff;
- }
-
- int dst_i444_y_size = dst_width * dst_height;
- int dst_i444_uv_size = dst_width * dst_height;
- int dst_i444_size = dst_i444_y_size + dst_i444_uv_size * 2;
- align_buffer_page_end(dst_i444_c, dst_i444_size);
- align_buffer_page_end(dst_i444_opt, dst_i444_size);
- memset(dst_i444_c, 2, dst_i444_size);
- memset(dst_i444_opt, 3, dst_i444_size);
-
- MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
- I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
- src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
- dst_i444_c, dst_width, dst_i444_c + dst_i444_y_size, dst_width,
- dst_i444_c + dst_i444_y_size + dst_i444_uv_size, dst_width,
- src_width, src_height, mode);
-
- MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
- for (int i = 0; i < benchmark_iterations; ++i) {
- I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
- src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
- dst_i444_opt, dst_width, dst_i444_opt + dst_i444_y_size,
- dst_width, dst_i444_opt + dst_i444_y_size + dst_i444_uv_size,
- dst_width, src_width, src_height, mode);
- }
-
- // Rotation should be exact.
- for (int i = 0; i < dst_i444_size; ++i) {
- EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]);
- }
-
- free_aligned_buffer_page_end(dst_i444_c);
- free_aligned_buffer_page_end(dst_i444_opt);
- free_aligned_buffer_page_end(src_i444);
-}
-
-TEST_F(LibYUVRotateTest, I444Rotate0_Opt) {
- I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate0, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, I444Rotate90_Opt) {
- I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate90, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, I444Rotate180_Opt) {
- I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate180, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, I444Rotate270_Opt) {
- I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate270, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-// TODO(fbarchard): Remove odd width tests.
-// Odd width tests work but are disabled because they exercise the C code
-// paths, which can instead be tested by passing an odd width on the command
-// line or via an environment variable.
-TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) {
- I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) {
- I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) {
- I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) {
- I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-static void NV12TestRotate(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- libyuv::RotationMode mode,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
- if (src_width < 1) {
- src_width = 1;
- }
- if (src_height == 0) { // allow negative for inversion test.
- src_height = 1;
- }
- if (dst_width < 1) {
- dst_width = 1;
- }
- if (dst_height < 1) {
- dst_height = 1;
- }
- int src_nv12_y_size = src_width * Abs(src_height);
- int src_nv12_uv_size =
- ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2) * 2;
- int src_nv12_size = src_nv12_y_size + src_nv12_uv_size;
- align_buffer_page_end(src_nv12, src_nv12_size);
- for (int i = 0; i < src_nv12_size; ++i) {
- src_nv12[i] = fastrand() & 0xff;
- }
-
- int dst_i420_y_size = dst_width * dst_height;
- int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
- int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
- align_buffer_page_end(dst_i420_c, dst_i420_size);
- align_buffer_page_end(dst_i420_opt, dst_i420_size);
- memset(dst_i420_c, 2, dst_i420_size);
- memset(dst_i420_opt, 3, dst_i420_size);
-
- MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
- NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
- (src_width + 1) & ~1, dst_i420_c, dst_width,
- dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
- dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
- (dst_width + 1) / 2, src_width, src_height, mode);
-
- MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
- for (int i = 0; i < benchmark_iterations; ++i) {
- NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
- (src_width + 1) & ~1, dst_i420_opt, dst_width,
- dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
- dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
- (dst_width + 1) / 2, src_width, src_height, mode);
- }
-
- // Rotation should be exact.
- for (int i = 0; i < dst_i420_size; ++i) {
- EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
- }
-
- free_aligned_buffer_page_end(dst_i420_c);
- free_aligned_buffer_page_end(dst_i420_opt);
- free_aligned_buffer_page_end(src_nv12);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) {
- NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate0, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) {
- NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate90, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) {
- NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate180, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) {
- NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate270, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) {
- NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) {
- NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) {
- NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) {
- NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
- benchmark_iterations_, disable_cpu_flags_,
- benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) {
- NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate0, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) {
- NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate90, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) {
- NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
- benchmark_height_, kRotate180, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
- NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
- benchmark_width_, kRotate270, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_);
-}
-
-} // namespace libyuv
diff --git a/files/util/android/test_runner.py b/files/util/android/test_runner.py
deleted file mode 100755
index 8b06b7ea..00000000
--- a/files/util/android/test_runner.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2014 The LibYuv Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style license
-# that can be found in the LICENSE file in the root of the source
-# tree. An additional intellectual property rights grant can be found
-# in the file PATENTS. All contributing project authors may
-# be found in the AUTHORS file in the root of the source tree.
-
-"""
-Runs tests on Android devices.
-
-This script exists to avoid Libyuv being broken by changes in the Chrome Android
-test execution toolchain. It also conveniently sets the CHECKOUT_SOURCE_ROOT
-environment variable.
-"""
-
-import os
-import sys
-
-SCRIPT_DIR = os.path.dirname(__file__)
-ROOT_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir, os.pardir))
-CHROMIUM_BUILD_ANDROID_DIR = os.path.join(ROOT_DIR, 'build', 'android')
-sys.path.insert(0, CHROMIUM_BUILD_ANDROID_DIR)
-
-
-import test_runner # pylint: disable=W0406
-
-def main():
- # Override environment variable to make it possible for the scripts to find
- # the root directory (our symlinking of the Chromium build toolchain would
- # otherwise make them fail to do so).
- os.environ['CHECKOUT_SOURCE_ROOT'] = ROOT_DIR
- return test_runner.main()
-
-if __name__ == '__main__':
- sys.exit(main())
diff --git a/files/fuzz/Android.bp b/fuzz/Android.bp
index 0e495899..a8d552b1 100644
--- a/files/fuzz/Android.bp
+++ b/fuzz/Android.bp
@@ -2,10 +2,10 @@
package {
// See: http://go/android-license-faq
// A large-scale-change added 'default_applicable_licenses' to import
- // all of the 'license_kinds' from "external_libyuv_files_license"
+ // all of the 'license_kinds' from "external_libyuv_license"
// to get the below license kinds:
// SPDX-license-identifier-BSD
- default_applicable_licenses: ["external_libyuv_files_license"],
+ default_applicable_licenses: ["external_libyuv_license"],
}
cc_fuzz {
diff --git a/files/fuzz/OWNERS b/fuzz/OWNERS
index 37481f5d..37481f5d 100644
--- a/files/fuzz/OWNERS
+++ b/fuzz/OWNERS
diff --git a/files/fuzz/mjpeg_dec_fuzz.cc b/fuzz/mjpeg_dec_fuzz.cc
index 3be8410a..3be8410a 100644
--- a/files/fuzz/mjpeg_dec_fuzz.cc
+++ b/fuzz/mjpeg_dec_fuzz.cc
diff --git a/files/include/libyuv.h b/include/libyuv.h
index aeffd5ef..a06e1233 100644
--- a/files/include/libyuv.h
+++ b/include/libyuv.h
@@ -26,6 +26,7 @@
#include "libyuv/scale.h"
#include "libyuv/scale_argb.h"
#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h"
#include "libyuv/version.h"
#include "libyuv/video_common.h"
diff --git a/files/include/libyuv/basic_types.h b/include/libyuv/basic_types.h
index 1bea67f2..1bea67f2 100644
--- a/files/include/libyuv/basic_types.h
+++ b/include/libyuv/basic_types.h
diff --git a/files/include/libyuv/compare.h b/include/libyuv/compare.h
index 3353ad71..3353ad71 100644
--- a/files/include/libyuv/compare.h
+++ b/include/libyuv/compare.h
diff --git a/files/include/libyuv/compare_row.h b/include/libyuv/compare_row.h
index e95b9d93..8293c919 100644
--- a/files/include/libyuv/compare_row.h
+++ b/include/libyuv/compare_row.h
@@ -28,7 +28,10 @@ extern "C" {
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
#define LIBYUV_DISABLE_X86
#endif
#endif
@@ -55,20 +58,20 @@ extern "C" {
// The following are available for Visual C and clangcl 32 bit:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
+ !defined(__clang__) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_HASHDJB2_AVX2
#define HAS_SUMSQUAREERROR_AVX2
#endif
-// The following are available for GCC and clangcl 64 bit:
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+// The following are available for GCC and clangcl:
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_HAMMINGDISTANCE_SSSE3
#endif
-// The following are available for GCC and clangcl 64 bit:
+// The following are available for GCC and clangcl:
#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+ (defined(__x86_64__) || defined(__i386__))
#define HAS_HAMMINGDISTANCE_AVX2
#endif
@@ -84,11 +87,6 @@ extern "C" {
#define HAS_SUMSQUAREERROR_MSA
#endif
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-#define HAS_HAMMINGDISTANCE_MMI
-#define HAS_SUMSQUAREERROR_MMI
-#endif
-
uint32_t HammingDistance_C(const uint8_t* src_a,
const uint8_t* src_b,
int count);
@@ -107,9 +105,6 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
uint32_t HammingDistance_MSA(const uint8_t* src_a,
const uint8_t* src_b,
int count);
-uint32_t HammingDistance_MMI(const uint8_t* src_a,
- const uint8_t* src_b,
- int count);
uint32_t SumSquareError_C(const uint8_t* src_a,
const uint8_t* src_b,
int count);
@@ -125,9 +120,6 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
uint32_t SumSquareError_MSA(const uint8_t* src_a,
const uint8_t* src_b,
int count);
-uint32_t SumSquareError_MMI(const uint8_t* src_a,
- const uint8_t* src_b,
- int count);
uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed);
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed);
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
new file mode 100644
index 00000000..88619a4f
--- /dev/null
+++ b/include/libyuv/convert.h
@@ -0,0 +1,1045 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_H_
+#define INCLUDE_LIBYUV_CONVERT_H_
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/rotate.h" // For enum RotationMode.
+
+// TODO(fbarchard): fix WebRTC source to include the following libyuv headers:
+#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620
+#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620
+#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I444 to NV12.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I444 to NV21.
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I422 to I444.
+LIBYUV_API
+int I422ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I422 to I210.
+LIBYUV_API
+int I422ToI210(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert MM21 to NV12.
+LIBYUV_API
+int MM21ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert MM21 to I420.
+LIBYUV_API
+int MM21ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert MM21 to YUY2
+LIBYUV_API
+int MM21ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height);
+
+// Convert MT2T to P010.
+// Note that src_y and src_uv point to packed 10-bit values, so each row of
+// the Y plane occupies 10 / 8 times the image width in bytes. For this
+// reason, src_stride_y and src_stride_uv are given in bytes.
+LIBYUV_API
+int MT2TToP010(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
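+
+// Illustrative stride arithmetic (example values, not part of the API): a
+// packed 10-bit row stores 8 samples per 10 bytes, so for a 1280x720 frame:
+//   int src_stride_y = 1280 * 10 / 8;   // 1600 bytes per packed Y row
+//   int src_stride_uv = 1280 * 10 / 8;  // interleaved UV rows pack the same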
+
+// Convert I422 to NV21.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I420 to I444.
+LIBYUV_API
+int I420ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy I010 to I010
+#define I010ToI010 I010Copy
+#define H010ToH010 I010Copy
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert 10-bit YUV to 8-bit.
+#define H010ToH420 I010ToI420
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H210ToH420 I210ToI420
+LIBYUV_API
+int I210ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H210ToH422 I210ToI422
+LIBYUV_API
+int I210ToI422(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H410ToH420 I410ToI420
+LIBYUV_API
+int I410ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H410ToH444 I410ToI444
+LIBYUV_API
+int I410ToI444(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H012ToH420 I012ToI420
+LIBYUV_API
+int I012ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H212ToH422 I212ToI422
+LIBYUV_API
+int I212ToI422(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H212ToH420 I212ToI420
+LIBYUV_API
+int I212ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H412ToH444 I412ToI444
+LIBYUV_API
+int I412ToI444(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define H412ToH420 I412ToI420
+LIBYUV_API
+int I412ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define I412ToI012 I410ToI010
+#define H410ToH010 I410ToI010
+#define H412ToH012 I410ToI010
+LIBYUV_API
+int I410ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+#define I212ToI012 I210ToI010
+#define H210ToH010 I210ToI010
+#define H212ToH012 I210ToI010
+LIBYUV_API
+int I210ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I010 to I410
+LIBYUV_API
+int I010ToI410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I012 to I412
+#define I012ToI412 I010ToI410
+
+// Convert I210 to I410
+LIBYUV_API
+int I210ToI410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I212 to I412
+#define I212ToI412 I210ToI410
+
+// Convert I010 to P010
+LIBYUV_API
+int I010ToP010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I210 to P210
+LIBYUV_API
+int I210ToP210(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I012 to P012
+LIBYUV_API
+int I012ToP012(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I212 to P212
+LIBYUV_API
+int I212ToP212(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I400 (grey) to NV21.
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+#define J400ToJ420 I400ToI420
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert NV21 to I420.
+LIBYUV_API
+int NV21ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert NV12 to NV24.
+LIBYUV_API
+int NV12ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert NV16 to NV24.
+LIBYUV_API
+int NV16ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert P010 to I010.
+LIBYUV_API
+int P010ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert P012 to I012.
+LIBYUV_API
+int P012ToI012(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert P010 to P410.
+LIBYUV_API
+int P010ToP410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert P012 to P412.
+#define P012ToP412 P010ToP410
+
+// Convert P016 to P416.
+#define P016ToP416 P010ToP410
+
+// Convert P210 to P410.
+LIBYUV_API
+int P210ToP410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert P212 to P412.
+#define P212ToP412 P210ToP410
+
+// Convert P216 to P416.
+#define P216ToP416 P210ToP410
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
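+
+// A minimal usage sketch (argument names hypothetical): for an Android
+// YUV_420_888 image, pass the chroma planes' PixelStride as
+// src_pixel_stride_uv. A pixel stride of 1 means planar (I420-like) chroma
+// and 2 means interleaved (NV12/NV21-like) chroma; both are handled here.
+//   Android420ToI420(y, y_stride, u, u_stride, v, v_stride, uv_pixel_stride,
+//                    dst_y, width, dst_u, (width + 1) / 2,
+//                    dst_v, (width + 1) / 2, width, height);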
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
+int ARGBToI420(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert ARGB to I420 with Alpha
+LIBYUV_API
+int ARGBToI420Alpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height);
+
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB little endian (bgr in memory) to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to J420.
+LIBYUV_API
+int RAWToJ420(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB little endian (bgr in memory) to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// src_width/height are provided by the capture.
+// dst_width/height are used for clipping and determine the final size.
+LIBYUV_API
+int MJPGToI420(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// JPEG to NV21
+LIBYUV_API
+int MJPGToNV21(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// JPEG to NV12
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8_t* sample,
+ size_t sample_size,
+ int* width,
+ int* height);
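+
+// A minimal decode sketch (buffer names hypothetical): query the JPEG
+// dimensions first, then decode at the same size:
+//   int w, h;
+//   if (MJPGSize(sample, sample_size, &w, &h) == 0) {
+//     MJPGToI420(sample, sample_size, dst_y, w, dst_u, (w + 1) / 2,
+//                dst_v, (w + 1) / 2, w, h, w, h);
+//   }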
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "sample_size" is needed to parse MJPG.
+// "dst_stride_y" is the number of bytes in a row of the dst_y plane.
+//   Normally this would be the same as dst_width, with recommended alignment
+//   to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, the stride is affected. The caller
+//   should allocate the I420 buffer according to the rotation.
+// "dst_stride_u" is the number of bytes in a row of the dst_u plane.
+//   Normally this would be the same as (dst_width + 1) / 2, with
+//   recommended alignment to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, the stride is affected.
+// "crop_x" and "crop_y" are the starting position for cropping.
+//   To center: crop_x = (src_width - crop_width) / 2,
+//              crop_y = (src_height - crop_height) / 2.
+// "src_width" / "src_height" is the size of src_frame in pixels.
+//   "src_height" can be negative, indicating a vertically flipped image
+//   source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//   Both must be less than or equal to src_width/src_height.
+//   Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "fourcc" is a FourCC code, e.g. 'I420' or 'YUY2'.
+// Returns 0 on success; -1 for an invalid parameter. Any non-zero value
+// indicates failure.
+LIBYUV_API
+int ConvertToI420(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
+ enum RotationMode rotation,
+ uint32_t fourcc);
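+
+// A usage sketch under assumed capture parameters (buffer names
+// hypothetical; FOURCC() is the packing macro from libyuv/video_common.h):
+// crop a centered 640x480 window from a 1280x720 YUY2 capture and rotate it
+// 90 degrees. Cropping is pre-rotation, so the I420 buffers are 480x640:
+//   int crop_x = (1280 - 640) / 2;  // 320
+//   int crop_y = (720 - 480) / 2;   // 120
+//   ConvertToI420(sample, sample_size, dst_y, 480, dst_u, 240, dst_v, 240,
+//                 crop_x, crop_y, 1280, 720, 640, 480, kRotate90,
+//                 FOURCC('Y', 'U', 'Y', '2'));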
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_H_
diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h
new file mode 100644
index 00000000..35eeac9b
--- /dev/null
+++ b/include/libyuv/convert_argb.h
@@ -0,0 +1,2315 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/rotate.h" // For enum RotationMode.
+#include "libyuv/scale.h" // For enum FilterMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Conversion matrices for YUV to RGB.
+LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // BT.601 full
+LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full
+LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYuvV2020Constants; // BT.2020 full
+
+// Conversion matrices for YVU to BGR.
+LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // BT.601 full
+LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full
+LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
+LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full
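+
+// Informal summary of the prefix convention used by the constants above and
+// the functions below (inferred from this header, not an official table):
+//   I = BT.601 limited range    J = BT.601 full range (JPEG)
+//   H = BT.709 limited range    F = BT.709 full range
+//   U = BT.2020 limited range   V = BT.2020 full range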
+
+// Macros for conversions to channel-swapped destinations (Matrix variants).
+// Each swaps U and V and passes the mirrored kYvu*Constants matrix.
+// TODO(fbarchard): Add a macro for each Matrix function.
+#define kYuvI601ConstantsVU kYvuI601Constants
+#define kYuvJPEGConstantsVU kYvuJPEGConstants
+#define kYuvH709ConstantsVU kYvuH709Constants
+#define kYuvF709ConstantsVU kYvuF709Constants
+#define kYuv2020ConstantsVU kYvu2020Constants
+#define kYuvV2020ConstantsVU kYvuV2020Constants
+
+#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define I010ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I010ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I210ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I210ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I410ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I410ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I010ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I010ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I210ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I012ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I012ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I422AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I444AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I010AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I210AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+#define I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I410AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
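+
+// Illustrative expansion (not an additional API): the macros token-paste
+// "VU" onto the constants argument, so it must be a bare kYuv* name. E.g.
+//   NV12ToABGRMatrix(y, ys, uv, uvs, dst, ds, kYuvI601Constants, w, h)
+// expands to
+//   NV21ToARGBMatrix(y, ys, uv, uvs, dst, ds, kYvuI601Constants, w, h)
+// via the kYuvI601ConstantsVU alias above.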
+
+// Alias.
+#define ARGBToARGB ARGBCopy
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I444 to RGB24.
+LIBYUV_API
+int I444ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert I444 to RAW.
+LIBYUV_API
+int I444ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H010 to ABGR.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U010 to ARGB.
+LIBYUV_API
+int U010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U010 to ABGR.
+LIBYUV_API
+int U010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I210 to ARGB.
+LIBYUV_API
+int I210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I210 to ABGR.
+LIBYUV_API
+int I210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H210 to ARGB.
+LIBYUV_API
+int H210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H210 to ABGR.
+LIBYUV_API
+int H210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I420 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate);
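+
+// Informal note on "attenuate" (a sketch, assuming typical use): when
+// non-zero, the converted RGB channels are multiplied by alpha
+// (premultiplied alpha); when zero, alpha is copied through unmodified.
+//   I420AlphaToARGB(y, ys, u, us, v, vs, a, as, dst, ds, w, h,
+//                   1 /* attenuate */);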
+
+// Convert I420 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I422 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I422AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I422 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I422AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I444 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I444AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I444 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I444AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J400 (jpeg grey) to ARGB.
+LIBYUV_API
+int J400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Alias.
+#define YToARGB I400ToARGB
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert NV12 to ABGR.
+LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert NV21 to ABGR.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert NV21 to YUV24.
+LIBYUV_API
+int NV21ToYUV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_yuv24,
+ int dst_stride_yuv24,
+ int width,
+ int height);
+
+// Convert NV12 to RAW.
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert U010 to AR30.
+LIBYUV_API
+int U010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert U010 to AB30.
+LIBYUV_API
+int U010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert I210 to AR30.
+LIBYUV_API
+int I210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I210 to AB30.
+LIBYUV_API
+int I210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert H210 to AR30.
+LIBYUV_API
+int H210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H210 to AB30.
+LIBYUV_API
+int H210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert U210 to AR30.
+LIBYUV_API
+int U210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert U210 to AB30.
+LIBYUV_API
+int U210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to RGBA.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Aliases: swapping the channel order of both the source and the
+// destination reuses the same conversion.
+#define AB30ToARGB AR30ToABGR
+#define AB30ToABGR AR30ToARGB
+#define AB30ToAR30 AR30ToAB30
+
+// Convert AR30 To ARGB.
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert AR30 To ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert AR30 To AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert AR64 to ARGB.
+LIBYUV_API
+int AR64ToARGB(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert AB64 to ABGR.
+#define AB64ToABGR AR64ToARGB
+
+// Convert AB64 to ARGB.
+LIBYUV_API
+int AB64ToARGB(const uint16_t* src_ab64,
+ int src_stride_ab64,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert AR64 to ABGR.
+#define AR64ToABGR AB64ToARGB
+
+// Convert AR64 To AB64.
+LIBYUV_API
+int AR64ToAB64(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height);
+
+// Convert AB64 To AR64.
+#define AB64ToAR64 AR64ToAB64
+
+// Convert MJPG to ARGB.
+// src_width/height are provided by the capture.
+// dst_width/height, used for clipping, determine the final size.
+LIBYUV_API
+int MJPGToARGB(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
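+// A hedged usage sketch (buffer names and sizes are illustrative, not part
+// of the API): decode one MJPG frame at its native size, with no clipping.
+//   uint8_t* argb = (uint8_t*)malloc((size_t)width * height * 4);
+//   int r = MJPGToARGB(jpeg_data, jpeg_size, argb, width * 4,
+//                      width, height,   // size reported by the capture
+//                      width, height);  // dst equal to src: no clipping
+//   // r == 0 indicates success.
+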
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert Android420 to ABGR.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
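+// In the Android420 functions, "src_pixel_stride_uv" is the byte distance
+// between successive U (or V) samples within a row, as reported by
+// Android's YUV_420_888 planes: 1 when chroma is planar (I420-like) and
+// 2 when U and V are interleaved (NV12/NV21-like). A sketch with
+// placeholder plane pointers:
+//   Android420ToARGB(y, y_row_stride, u, uv_row_stride, v, uv_row_stride,
+//                    uv_pixel_stride,  // 1 = planar, 2 = interleaved
+//                    argb, width * 4, width, height);
+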
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert I422 to RGB24.
+LIBYUV_API
+int I422ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert I422 to RAW.
+LIBYUV_API
+int I422ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert I420 to RGB565 with a 4x4 dither matrix (16 bytes).
+// Values in the dither matrix from 0 to 7 are recommended.
+// The dither matrix is laid out with the first byte as the upper-left entry.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height);
+
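+// A worked example (the matrix values are illustrative): the classic 4x4
+// ordered-dither pattern, halved to fit the recommended 0..7 range.
+//   static const uint8_t kDither4x4[16] = {0, 4, 1, 5, 6, 2, 7, 3,
+//                                          1, 5, 0, 4, 7, 3, 6, 2};
+//   I420ToRGB565Dither(y, y_stride, u, u_stride, v, v_stride,
+//                      rgb565, width * 2,  // 2 bytes per RGB565 pixel
+//                      kDither4x4, width, height);
+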
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height);
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I420 to AB30.
+LIBYUV_API
+int I420ToAB30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H420 to AB30.
+LIBYUV_API
+int H420ToAB30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
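+// The Matrix variants take an explicit YuvConstants selecting the
+// colorspace and range; libyuv provides constants such as
+// kYuvI601Constants (BT.601 limited range) and kYuvJPEGConstants (full
+// range). The plain wrappers above are typically equivalent to passing
+// one of these. A hedged sketch:
+//   I420ToARGBMatrix(y, y_stride, u, u_stride, v, v_stride,
+//                    argb, width * 4, &kYuvI601Constants, width, height);
+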
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I444 to RGB24 with matrix.
+LIBYUV_API
+int I444ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 420 YUV to AR30 with matrix.
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 422 YUV to AR30 with matrix.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 444 YUV to AR30 with matrix.
+LIBYUV_API
+int I410ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 420 YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 12 bit 420 YUV to AR30 with matrix. The 12 bit samples are
+// multiplied up into the high bits, which allows any number of bits.
+LIBYUV_API
+int I012ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 12 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I012ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 444 YUV to ARGB with matrix.
+LIBYUV_API
+int I410ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert P010 to ARGB with matrix.
+LIBYUV_API
+int P010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert P210 to ARGB with matrix.
+LIBYUV_API
+int P210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert P010 to AR30 with matrix.
+LIBYUV_API
+int P010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert P210 to AR30 with matrix.
+LIBYUV_API
+int P210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// P012 and P010 use the most significant bits, so the conversion is the same.
+// Convert P012 to ARGB with matrix.
+#define P012ToARGBMatrix P010ToARGBMatrix
+// Convert P012 to AR30 with matrix.
+#define P012ToAR30Matrix P010ToAR30Matrix
+// Convert P212 to ARGB with matrix.
+#define P212ToARGBMatrix P210ToARGBMatrix
+// Convert P212 to AR30 with matrix.
+#define P212ToAR30Matrix P210ToAR30Matrix
+
+// Convert P016 to ARGB with matrix.
+#define P016ToARGBMatrix P010ToARGBMatrix
+// Convert P016 to AR30 with matrix.
+#define P016ToAR30Matrix P010ToAR30Matrix
+// Convert P216 to ARGB with matrix.
+#define P216ToARGBMatrix P210ToARGBMatrix
+// Convert P216 to AR30 with matrix.
+#define P216ToAR30Matrix P210ToAR30Matrix
+
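+// Why the aliases above work: P010/P012/P016 samples are MSB-aligned in a
+// 16 bit lane, e.g. a 10 bit value v is stored as (v << 6) and a 12 bit
+// value as (v << 4), so a conversion that consumes only the top bits is
+// depth-agnostic. Illustrative arithmetic:
+//   uint16_t p010 = (uint16_t)(v10 << 6);  // 10 bit sample, MSB aligned
+//   uint16_t p012 = (uint16_t)(v12 << 4);  // 12 bit sample, MSB aligned
+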
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
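+// "attenuate": when non-zero, the RGB channels are premultiplied by alpha
+// (roughly r = r * a / 255) to produce attenuated ARGB; when zero, alpha
+// is copied through unmultiplied. A hedged sketch:
+//   I420AlphaToARGBMatrix(y, y_stride, u, u_stride, v, v_stride,
+//                         a, a_stride, argb, width * 4,
+//                         &kYuvI601Constants, width, height,
+//                         1);  // premultiply RGB by alpha
+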
+// Convert I422 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I422AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I444 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I444AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I010 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I010AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I210 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I210AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I410 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I410AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert Android420 to ARGB with matrix.
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to RGB24 with matrix.
+LIBYUV_API
+int I422ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to RGB565 with specified color matrix.
+LIBYUV_API
+int I422ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I400 (grey) to ARGB with matrix. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I420ToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
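+// The Filter variants take a FilterMode (see scale.h) controlling chroma
+// upsampling: kFilterNone selects point sampling, kFilterBilinear
+// interpolates between UV samples. A hedged sketch:
+//   I420ToARGBMatrixFilter(y, y_stride, u, u_stride, v, v_stride,
+//                          argb, width * 4, &kYuvI601Constants,
+//                          width, height, kFilterBilinear);
+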
+// Convert I422 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I422ToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I422 to RGB24 with matrix and UV filter mode.
+LIBYUV_API
+int I422ToRGB24MatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I420 to RGB24 with matrix and UV filter mode.
+LIBYUV_API
+int I420ToRGB24MatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I010 to AR30 with matrix and UV filter mode.
+LIBYUV_API
+int I010ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I210 to AR30 with matrix and UV filter mode.
+LIBYUV_API
+int I210ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I010 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I010ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I210 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I210ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert I420 with Alpha to attenuated ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I420AlphaToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter);
+
+// Convert I422 with Alpha to attenuated ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I422AlphaToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter);
+
+// Convert I010 with Alpha to attenuated ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I010AlphaToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter);
+
+// Convert I210 with Alpha to attenuated ARGB with matrix and UV filter mode.
+LIBYUV_API
+int I210AlphaToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter);
+
+// Convert P010 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int P010ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert P210 to ARGB with matrix and UV filter mode.
+LIBYUV_API
+int P210ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert P010 to AR30 with matrix and UV filter mode.
+LIBYUV_API
+int P010ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert P210 to AR30 with matrix and UV filter mode.
+LIBYUV_API
+int P210ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter);
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "sample_size" is needed to parse MJPG.
+// "dst_stride_argb" is the number of bytes in a row of the dst_argb plane.
+//   Normally this would be dst_width * 4, with recommended alignment
+//   to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, the stride is affected: the caller
+//   should allocate the destination buffer according to the rotation.
+// "crop_x" and "crop_y" are the starting position for cropping.
+//   To center: crop_x = (src_width - dst_width) / 2
+//              crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is the size of src_frame in pixels.
+//   "src_height" can be negative, indicating a vertically flipped image
+//   source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//   Must be less than or equal to src_width/src_height.
+//   Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "fourcc" is a fourcc, e.g. 'I420' or 'YUY2'.
+// Returns 0 for success; -1 for an invalid parameter; other non-zero values
+// indicate failure.
+LIBYUV_API
+int ConvertToARGB(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
+ enum RotationMode rotation,
+ uint32_t fourcc);
+
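+// A hedged sketch: center-crop a 640x480 I420 sample to 320x240 and rotate
+// 90 degrees, giving a 240x320 ARGB output (kRotate90 and the FOURCC macro
+// are declared in rotate.h and video_common.h; buffer names are
+// placeholders).
+//   ConvertToARGB(frame, frame_size, argb, 240 * 4,  // rotated width is 240
+//                 (640 - 320) / 2, (480 - 240) / 2,  // centered crop
+//                 640, 480, 320, 240,
+//                 kRotate90, FOURCC('I', '4', '2', '0'));
+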
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_
diff --git a/files/include/libyuv/convert_from.h b/include/libyuv/convert_from.h
index 861418d0..32f42a63 100644
--- a/files/include/libyuv/convert_from.h
+++ b/include/libyuv/convert_from.h
@@ -23,6 +23,7 @@ extern "C" {
// Convert 8 bit YUV to 10 bit.
#define H420ToH010 I420ToI010
+LIBYUV_API
int I420ToI010(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
@@ -38,6 +39,24 @@ int I420ToI010(const uint8_t* src_y,
int width,
int height);
+// Convert 8 bit YUV to 12 bit.
+#define H420ToH012 I420ToI012
+LIBYUV_API
+int I420ToI012(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
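+// A hedged note on the expansion: 8 bit samples are shifted up into the
+// wider range, e.g. 8 bit white 235 becomes 235 << 2 = 940 in 10 bit and
+// 235 << 4 = 3760 in 12 bit, preserving limited-range levels.
+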
LIBYUV_API
int I420ToI422(const uint8_t* src_y,
int src_stride_y,
@@ -131,6 +150,10 @@ int I420ToUYVY(const uint8_t* src_y,
int width,
int height);
+// The following are from convert_argb.h.
+// DEPRECATED: These prototypes will be removed in the future; use
+// convert_argb.h instead.
+
+// Convert I420 to ARGB.
LIBYUV_API
int I420ToARGB(const uint8_t* src_y,
int src_stride_y,
@@ -143,18 +166,7 @@ int I420ToARGB(const uint8_t* src_y,
int width,
int height);
-LIBYUV_API
-int I420ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height);
-
+// Convert I420 to ABGR.
LIBYUV_API
int I420ToABGR(const uint8_t* src_y,
int src_stride_y,
@@ -167,181 +179,6 @@ int I420ToABGR(const uint8_t* src_y,
int width,
int height);
-LIBYUV_API
-int I420ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-LIBYUV_API
-int J420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-LIBYUV_API
-int H420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-LIBYUV_API
-int I422ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
-// Values in dither matrix from 0 to 7 recommended.
-// The order of the dither matrix is first byte is upper left.
-
-LIBYUV_API
-int I420ToRGB565Dither(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const uint8_t* dither4x4,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToARGB1555(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb1555,
- int dst_stride_argb1555,
- int width,
- int height);
-
-LIBYUV_API
-int I420ToARGB4444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb4444,
- int dst_stride_argb4444,
- int width,
- int height);
-
-// Convert I420 to AR30.
-LIBYUV_API
-int I420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
-// Convert H420 to AR30.
-LIBYUV_API
-int H420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height);
-
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
diff --git a/files/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h
index cbbef6fe..ff2a581a 100644
--- a/files/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@@ -77,6 +77,10 @@ int ARGBToAR30(const uint8_t* src_argb,
int width,
int height);
+// Aliases
+#define ABGRToRGB24 ARGBToRAW
+#define ABGRToRAW ARGBToRGB24
+
// Convert ARGB To RGB24.
LIBYUV_API
int ARGBToRGB24(const uint8_t* src_argb,
@@ -149,6 +153,30 @@ int ARGBToI444(const uint8_t* src_argb,
int width,
int height);
+// Convert ARGB to AR64.
+LIBYUV_API
+int ARGBToAR64(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height);
+
+// Convert ABGR to AB64.
+#define ABGRToAB64 ARGBToAR64
+
+// Convert ARGB to AB64.
+LIBYUV_API
+int ARGBToAB64(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height);
+
+// Convert ABGR to AR64.
+#define ABGRToAR64 ARGBToAB64
+
// Convert ARGB To I422.
LIBYUV_API
int ARGBToI422(const uint8_t* src_argb,
@@ -181,10 +209,10 @@ int ARGBToJ420(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_yj,
int dst_stride_yj,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
int width,
int height);
@@ -194,10 +222,10 @@ int ARGBToJ422(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_yj,
int dst_stride_yj,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
int width,
int height);
@@ -210,6 +238,50 @@ int ARGBToJ400(const uint8_t* src_argb,
int width,
int height);
+// Convert ABGR to J420. (JPEG full range I420).
+LIBYUV_API
+int ABGRToJ420(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
+ int width,
+ int height);
+
+// Convert ABGR to J422.
+LIBYUV_API
+int ABGRToJ422(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
+ int width,
+ int height);
+
+// Convert ABGR to J400. (JPEG full range).
+LIBYUV_API
+int ABGRToJ400(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// Convert RGBA to J400. (JPEG full range).
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
// Convert ARGB to I400.
LIBYUV_API
int ARGBToI400(const uint8_t* src_argb,
@@ -250,25 +322,25 @@ int ARGBToNV21(const uint8_t* src_argb,
int width,
int height);
-// Convert ARGB To NV21.
+// Convert ABGR To NV12.
LIBYUV_API
-int ARGBToNV21(const uint8_t* src_argb,
- int src_stride_argb,
+int ABGRToNV12(const uint8_t* src_abgr,
+ int src_stride_abgr,
uint8_t* dst_y,
int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
int width,
int height);
-// Convert ABGR To NV12.
+// Convert ABGR To NV21.
LIBYUV_API
-int ABGRToNV12(const uint8_t* src_abgr,
+int ABGRToNV21(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_y,
int dst_stride_y,
- uint8_t* dst_uv,
- int dst_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
int width,
int height);
@@ -290,6 +362,17 @@ int ARGBToUYVY(const uint8_t* src_argb,
int width,
int height);
+// Convert RAW to JNV21 (JPEG full range NV21).
+LIBYUV_API
+int RAWToJNV21(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h
index b01cd25c..5a81e7c9 100644
--- a/files/include/libyuv/cpu_id.h
+++ b/include/libyuv/cpu_id.h
@@ -31,24 +31,35 @@ static const int kCpuHasX86 = 0x10;
static const int kCpuHasSSE2 = 0x20;
static const int kCpuHasSSSE3 = 0x40;
static const int kCpuHasSSE41 = 0x80;
-static const int kCpuHasSSE42 = 0x100; // unused at this time.
+static const int kCpuHasSSE42 = 0x100;
static const int kCpuHasAVX = 0x200;
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000;
static const int kCpuHasF16C = 0x2000;
-static const int kCpuHasGFNI = 0x4000;
-static const int kCpuHasAVX512BW = 0x8000;
-static const int kCpuHasAVX512VL = 0x10000;
+static const int kCpuHasAVX512BW = 0x4000;
+static const int kCpuHasAVX512VL = 0x8000;
+static const int kCpuHasAVX512VNNI = 0x10000;
static const int kCpuHasAVX512VBMI = 0x20000;
static const int kCpuHasAVX512VBMI2 = 0x40000;
static const int kCpuHasAVX512VBITALG = 0x80000;
-static const int kCpuHasAVX512VPOPCNTDQ = 0x100000;
+static const int kCpuHasAVX10 = 0x100000;
+static const int kCpuHasAVXVNNI = 0x200000;
+static const int kCpuHasAVXVNNIINT8 = 0x400000;
// These flags are only valid on MIPS processors.
-static const int kCpuHasMIPS = 0x200000;
-static const int kCpuHasMSA = 0x400000;
-static const int kCpuHasMMI = 0x800000;
+static const int kCpuHasMIPS = 0x800000;
+static const int kCpuHasMSA = 0x1000000;
+
+// These flags are only valid on LOONGARCH processors.
+static const int kCpuHasLOONGARCH = 0x2000000;
+static const int kCpuHasLSX = 0x4000000;
+static const int kCpuHasLASX = 0x8000000;
+
+// These flags are only valid on RISCV processors.
+static const int kCpuHasRISCV = 0x10000000;
+static const int kCpuHasRVV = 0x20000000;
+static const int kCpuHasRVVZVFH = 0x40000000;
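+
+// A hedged usage sketch: flags are tested at runtime to choose a SIMD path.
+//   if (TestCpuFlag(kCpuHasAVX2)) {
+//     /* dispatch to the AVX2 row functions */
+//   }
+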
// Optional init function. TestCpuFlag does an auto-init.
// Returns cpu_info flags.
@@ -71,6 +82,10 @@ static __inline int TestCpuFlag(int test_flag) {
// Internal function for parsing /proc/cpuinfo.
LIBYUV_API
int ArmCpuCaps(const char* cpuinfo_name);
+LIBYUV_API
+int MipsCpuCaps(const char* cpuinfo_name);
+LIBYUV_API
+int RiscvCpuCaps(const char* cpuinfo_name);
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
diff --git a/include/libyuv/loongson_intrinsics.h b/include/libyuv/loongson_intrinsics.h
new file mode 100644
index 00000000..1d613def
--- /dev/null
+++ b/include/libyuv/loongson_intrinsics.h
@@ -0,0 +1,1949 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H
+#define INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H
+
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * All rights reserved.
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ * Xiwei Gu <guxiwei-hf@loongson.cn>
+ * Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is a header file for the LoongArch builtin extensions.
+ *
+ */
+
+#ifndef LOONGSON_INTRINSICS_H
+#define LOONGSON_INTRINSICS_H
+
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: New functions added, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LSOM_VERSION_MAJOR 1
+#define LSOM_VERSION_MINOR 1
+#define LSOM_VERSION_MICRO 0
+
+#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
+ { \
+ _OUT0 = _INS(_IN0); \
+ _OUT1 = _INS(_IN1); \
+ }
+
+#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
+ { \
+ _OUT0 = _INS(_IN0, _IN1); \
+ _OUT1 = _INS(_IN2, _IN3); \
+ }
+
+#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
+ { \
+ _OUT0 = _INS(_IN0, _IN1, _IN2); \
+ _OUT1 = _INS(_IN3, _IN4, _IN5); \
+ }
+
+#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
+ DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
+ }
+
+#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
+ _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
+ DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
+ }
+
+#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
+ _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
+ DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
+ }
+
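+// For example (a minimal sketch), DUP2_ARG2 fans a two-argument intrinsic
+// out over two independent input/output pairs:
+//   __m128i a, b;
+//   DUP2_ARG2(__lsx_vadd_b, in0, in1, in2, in3, a, b);
+//   // a = __lsx_vadd_b(in0, in1); b = __lsx_vadd_b(in2, in3);
+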
+#ifdef __loongarch_sx
+#include <lsxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Signed byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * The results are then added to the signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ * in_c : 1,2,3,4, 1,2,3,4
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_b(__m128i in_c,
+ __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * unsigned byte elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * The results are then added to the signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
+ * in_c : 1,2,3,4, 1,2,3,4
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c,
+ __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * The results are then added to the signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
+ * in_c : 1,1,1,1, 1,1,1,1
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
+ * out : -4,-24,-60,-112, 6,26,62,114
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c,
+ __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of half-word vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - word
+ * Details : Signed half-word elements from in_h are multiplied by
+ * signed half-word elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * The results are then added to the signed word elements from in_c.
+ * Example : out = __lsx_vdp2add_w_h(in_c, in_h, in_l)
+ * in_c : 1,2,3,4
+ * in_h : 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1
+ * out : 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_w_h(__m128i in_c,
+ __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Signed byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * Example : out = __lsx_vdp2_h_b(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_h_b(in_h, in_l);
+ out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * unsigned byte elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * Example : out = __lsx_vdp2_h_bu(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_h_bu(in_h, in_l);
+ out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
+ * out : 22,38,38,22, 22,38,38,6
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_h_bu_b(in_h, in_l);
+ out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of half-word vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - word
+ * Details : Signed half-word elements from in_h are multiplied by
+ * signed half-word elements from in_l, and adjacent products are
+ * added pairwise, producing results twice the width of the inputs.
+ * Example : out = __lsx_vdp2_w_h(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1
+ * out : 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_w_h(in_h, in_l);
+ out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : (_in))
+ * Arguments : Inputs - _in (input vector)
+ * - min (min threshold)
+ * - max (max threshold)
+ * Outputs - out (output vector with clipped elements)
+ * Return Type - signed halfword
+ * Example : out = __lsx_vclip_h(_in)
+ * _in : -8,2,280,249, -8,255,280,249
+ * min : 1,1,1,1, 1,1,1,1
+ * max : 9,9,9,9, 9,9,9,9
+ * out : 1,2,9,9, 1,9,9,9
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
+ __m128i out;
+
+ out = __lsx_vmax_h(min, _in);
+ out = __lsx_vmin_h(max, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments : Inputs - _in
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Signed half-word elements from _in are clamped between 0 and 255.
+ * Example : out = __lsx_vclip255_h(_in)
+ * _in : -8,255,280,249, -8,255,280,249
+ * out : 0,255,255,249, 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_h(__m128i _in) {
+ __m128i out;
+
+ out = __lsx_vmaxi_h(_in, 0);
+ out = __lsx_vsat_hu(out, 7);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments : Inputs - _in
+ * Outputs - out
+ * Return Type - word
+ * Details : Signed word elements from _in are clamped between 0 and 255.
+ * Example : out = __lsx_vclip255_w(_in)
+ * _in : -8,255,280,249
+ * out : 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_w(__m128i _in) {
+ __m128i out;
+
+ out = __lsx_vmaxi_w(_in, 0);
+ out = __lsx_vsat_wu(out, 7);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Swap two variables
+ * Arguments : Inputs - _in0, _in1
+ * Outputs - _in0, _in1 (in-place)
+ * Details : Swapping of two input variables using xor
+ * Example : LSX_SWAP(_in0, _in1)
+ * _in0 : 1,2,3,4
+ * _in1 : 5,6,7,8
+ * _in0(out) : 5,6,7,8
+ * _in1(out) : 1,2,3,4
+ * =============================================================================
+ */
+#define LSX_SWAP(_in0, _in1) \
+ { \
+ _in0 = __lsx_vxor_v(_in0, _in1); \
+ _in1 = __lsx_vxor_v(_in0, _in1); \
+ _in0 = __lsx_vxor_v(_in0, _in1); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Details :
+ * Example :
+ * 1, 2, 3, 4 1, 5, 9,13
+ * 5, 6, 7, 8 to 2, 6,10,14
+ * 9,10,11,12 =====> 3, 7,11,15
+ * 13,14,15,16 4, 8,12,16
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ __m128i _t0, _t1, _t2, _t3; \
+ \
+ _t0 = __lsx_vilvl_w(_in1, _in0); \
+ _t1 = __lsx_vilvh_w(_in1, _in0); \
+ _t2 = __lsx_vilvl_w(_in3, _in2); \
+ _t3 = __lsx_vilvh_w(_in3, _in2); \
+ _out0 = __lsx_vilvl_d(_t2, _t0); \
+ _out1 = __lsx_vilvh_d(_t2, _t0); \
+ _out2 = __lsx_vilvl_d(_t3, _t1); \
+ _out3 = __lsx_vilvh_d(_t3, _t1); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with byte elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7
+ * Details : The rows of the matrix become columns, and the columns
+ * become rows.
+ * Example : LSX_TRANSPOSE8x8_B
+ * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
+ * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
+ * _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
+ * _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
+ * _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
+ * _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
+ * _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
+ * _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
+ *
+ * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ * _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
+ * _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
+ * _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
+ * _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m128i zero = {0}; \
+ __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; \
+ __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _t0 = __lsx_vilvl_b(_in2, _in0); \
+ _t1 = __lsx_vilvl_b(_in3, _in1); \
+ _t2 = __lsx_vilvl_b(_in6, _in4); \
+ _t3 = __lsx_vilvl_b(_in7, _in5); \
+ _t4 = __lsx_vilvl_b(_t1, _t0); \
+ _t5 = __lsx_vilvh_b(_t1, _t0); \
+ _t6 = __lsx_vilvl_b(_t3, _t2); \
+ _t7 = __lsx_vilvh_b(_t3, _t2); \
+ _out0 = __lsx_vilvl_w(_t6, _t4); \
+ _out2 = __lsx_vilvh_w(_t6, _t4); \
+ _out4 = __lsx_vilvl_w(_t7, _t5); \
+ _out6 = __lsx_vilvh_w(_t7, _t5); \
+ _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \
+ _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \
+ _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \
+ _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ * Details :
+ * Example :
+ * 00,01,02,03,04,05,06,07 00,10,20,30,40,50,60,70
+ * 10,11,12,13,14,15,16,17 01,11,21,31,41,51,61,71
+ * 20,21,22,23,24,25,26,27 02,12,22,32,42,52,62,72
+ * 30,31,32,33,34,35,36,37 to 03,13,23,33,43,53,63,73
+ * 40,41,42,43,44,45,46,47 ======> 04,14,24,34,44,54,64,74
+ * 50,51,52,53,54,55,56,57 05,15,25,35,45,55,65,75
+ * 60,61,62,63,64,65,66,67 06,16,26,36,46,56,66,76
+ * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _s0 = __lsx_vilvl_h(_in6, _in4); \
+ _s1 = __lsx_vilvl_h(_in7, _in5); \
+ _t0 = __lsx_vilvl_h(_s1, _s0); \
+ _t1 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvh_h(_in6, _in4); \
+ _s1 = __lsx_vilvh_h(_in7, _in5); \
+ _t2 = __lsx_vilvl_h(_s1, _s0); \
+ _t3 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvl_h(_in2, _in0); \
+ _s1 = __lsx_vilvl_h(_in3, _in1); \
+ _t4 = __lsx_vilvl_h(_s1, _s0); \
+ _t5 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvh_h(_in2, _in0); \
+ _s1 = __lsx_vilvh_h(_in3, _in1); \
+ _t6 = __lsx_vilvl_h(_s1, _s0); \
+ _t7 = __lsx_vilvh_h(_s1, _s0); \
+ \
+ _out0 = __lsx_vpickev_d(_t0, _t4); \
+ _out2 = __lsx_vpickev_d(_t1, _t5); \
+ _out4 = __lsx_vpickev_d(_t2, _t6); \
+ _out6 = __lsx_vpickev_d(_t3, _t7); \
+ _out1 = __lsx_vpickod_d(_t0, _t4); \
+ _out3 = __lsx_vpickod_d(_t1, _t5); \
+ _out5 = __lsx_vpickod_d(_t2, _t6); \
+ _out7 = __lsx_vpickod_d(_t3, _t7); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x4 byte block into 4x8
+ * Arguments : Inputs - _in0 ~ _in7 (input 8x4 byte block)
+ * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block)
+ * Return Type - byte
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : LSX_TRANSPOSE8x4_B
+ * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
+ *
+ * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3) \
+ { \
+ __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ \
+ _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
+ _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
+ _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
+ _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
+ \
+ _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
+ _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
+ \
+ _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
+ _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
+ _out1 = __lsx_vilvh_d(_out2, _out0); \
+ _out3 = __lsx_vilvh_d(_out0, _out2); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ * Details :
+ * Example :
+ * 000,001,002,003,004,005,006,007
+ * 008,009,010,011,012,013,014,015
+ * 016,017,018,019,020,021,022,023
+ * 024,025,026,027,028,029,030,031
+ * 032,033,034,035,036,037,038,039
+ * 040,041,042,043,044,045,046,047 000,008,...,112,120
+ * 048,049,050,051,052,053,054,055 001,009,...,113,121
+ * 056,057,058,059,060,061,062,063 to 002,010,...,114,122
+ * 064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
+ * 072,073,074,075,076,077,078,079 004,012,...,116,124
+ * 080,081,082,083,084,085,086,087 005,013,...,117,125
+ * 088,089,090,091,092,093,094,095 006,014,...,118,126
+ * 096,097,098,099,100,101,102,103 007,015,...,119,127
+ * 104,105,106,107,108,109,110,111
+ * 112,113,114,115,116,117,118,119
+ * 120,121,122,123,124,125,126,127
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
+ __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
+ _tmp0, _tmp1, _tmp2, _tmp3); \
+ DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
+ _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
+ DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
+ DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
+ DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
+ DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
+ DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
+ DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
+ DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
+ DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
+ DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
+ DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
+ DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
+ DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Details : Butterfly operation
+ * Example :
+ * _out0 = _in0 + _in3;
+ * _out1 = _in1 + _in2;
+ * _out2 = _in1 - _in2;
+ * _out3 = _in0 - _in3;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lsx_vadd_b(_in0, _in3); \
+ _out1 = __lsx_vadd_b(_in1, _in2); \
+ _out2 = __lsx_vsub_b(_in1, _in2); \
+ _out3 = __lsx_vsub_b(_in0, _in3); \
+ }
+#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lsx_vadd_h(_in0, _in3); \
+ _out1 = __lsx_vadd_h(_in1, _in2); \
+ _out2 = __lsx_vsub_h(_in1, _in2); \
+ _out3 = __lsx_vsub_h(_in0, _in3); \
+ }
+#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lsx_vadd_w(_in0, _in3); \
+ _out1 = __lsx_vadd_w(_in1, _in2); \
+ _out2 = __lsx_vsub_w(_in1, _in2); \
+ _out3 = __lsx_vsub_w(_in0, _in3); \
+ }
+#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lsx_vadd_d(_in0, _in3); \
+ _out1 = __lsx_vadd_d(_in1, _in2); \
+ _out2 = __lsx_vsub_d(_in1, _in2); \
+ _out3 = __lsx_vsub_d(_in0, _in3); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
+ * Outputs - _out0, _out1, _out2, _out3, ~
+ * Details : Butterfly operation
+ * Example :
+ * _out0 = _in0 + _in7;
+ * _out1 = _in1 + _in6;
+ * _out2 = _in2 + _in5;
+ * _out3 = _in3 + _in4;
+ * _out4 = _in3 - _in4;
+ * _out5 = _in2 - _in5;
+ * _out6 = _in1 - _in6;
+ * _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_b(_in0, _in7); \
+ _out1 = __lsx_vadd_b(_in1, _in6); \
+ _out2 = __lsx_vadd_b(_in2, _in5); \
+ _out3 = __lsx_vadd_b(_in3, _in4); \
+ _out4 = __lsx_vsub_b(_in3, _in4); \
+ _out5 = __lsx_vsub_b(_in2, _in5); \
+ _out6 = __lsx_vsub_b(_in1, _in6); \
+ _out7 = __lsx_vsub_b(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_h(_in0, _in7); \
+ _out1 = __lsx_vadd_h(_in1, _in6); \
+ _out2 = __lsx_vadd_h(_in2, _in5); \
+ _out3 = __lsx_vadd_h(_in3, _in4); \
+ _out4 = __lsx_vsub_h(_in3, _in4); \
+ _out5 = __lsx_vsub_h(_in2, _in5); \
+ _out6 = __lsx_vsub_h(_in1, _in6); \
+ _out7 = __lsx_vsub_h(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_w(_in0, _in7); \
+ _out1 = __lsx_vadd_w(_in1, _in6); \
+ _out2 = __lsx_vadd_w(_in2, _in5); \
+ _out3 = __lsx_vadd_w(_in3, _in4); \
+ _out4 = __lsx_vsub_w(_in3, _in4); \
+ _out5 = __lsx_vsub_w(_in2, _in5); \
+ _out6 = __lsx_vsub_w(_in1, _in6); \
+ _out7 = __lsx_vsub_w(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_d(_in0, _in7); \
+ _out1 = __lsx_vadd_d(_in1, _in6); \
+ _out2 = __lsx_vadd_d(_in2, _in5); \
+ _out3 = __lsx_vadd_d(_in3, _in4); \
+ _out4 = __lsx_vsub_d(_in3, _in4); \
+ _out5 = __lsx_vsub_d(_in2, _in5); \
+ _out6 = __lsx_vsub_d(_in1, _in6); \
+ _out7 = __lsx_vsub_d(_in0, _in7); \
+ }
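+
+/* The butterfly macros above implement the add/subtract stage common to
+ * DCT-style transforms. A minimal sketch, assuming four halfword vectors
+ * _r0.._r3 are already loaded (names illustrative):
+ *   __m128i _s0, _s1, _d1, _d0;
+ *   LSX_BUTTERFLY_4_H(_r0, _r1, _r2, _r3, _s0, _s1, _d1, _d0);
+ *   // _s0 = _r0 + _r3, _s1 = _r1 + _r2, _d1 = _r1 - _r2, _d0 = _r0 - _r3
+ */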
+
+#endif // LSX
+
+#ifdef __loongarch_asx
+#include <lasxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * unsigned byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * The products of adjacent odd-even element pairs are then
+ * added and stored to the out vector.
+ * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_h_bu(in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed halfword
+ * Details : Signed byte elements from in_h are multiplied with
+ * signed byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * The products of adjacent odd-even element pairs are then
+ * added and stored to the out vector.
+ * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_h_b(in_h, in_l);
+ out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Signed halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * The products of adjacent odd-even element pairs are then
+ * added and stored to the out vector.
+ * Example : out = __lasx_xvdp2_w_h(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ return out;
+}
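+
+/* Cross-checking the documented example above: each output word is the sum
+ * of one even/odd product pair, e.g. 1*8 + 2*7 = 22 and 3*6 + 4*5 = 38,
+ * which yields 22,38,38,22 in each 128-bit lane. */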
+
+/*
+ * =============================================================================
+ * Description : Dot product of word vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed doubleword
+ * Details : Signed word elements from in_h are multiplied with
+ * signed word elements from in_l producing a result
+ * twice the size of input i.e. signed double-word.
+ * The products of adjacent odd-even element pairs are then
+ * added and stored to the out vector.
+ * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_d_w(in_h, in_l);
+ out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Unsigned halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l, producing results
+ * twice the size of the input, i.e. signed word.
+ * The products of adjacent odd-even element pairs are then
+ * added and stored to the out vector.
+ * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Signed byte elements from in_h are multiplied with
+ * signed byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * The products of adjacent odd-even element pairs are then
+ * added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * unsigned byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * The products of adjacent odd-even element pairs are then
+ * added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * signed byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * The products of adjacent odd-even element pairs are then
+ * added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Signed halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * The products of adjacent odd-even element pairs
+ * are added to the in_c vector.
+ * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * in_c : 1,2,3,4, 1,2,3,4
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
+ * out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ return out;
+}
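+
+/* Cross-checking the documented example above: the dot-product terms are
+ * the same 22,38,38,22 as for __lasx_xvdp2_w_h, and adding the accumulator
+ * in_c (1,2,3,4) gives 23,40,41,26 in each 128-bit lane. */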
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Unsigned halfword elements from in_h are multiplied with
+ * unsigned halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * The products of adjacent odd-even element pairs
+ * are added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Unsigned halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * The products of adjacent odd-even element pairs
+ * are added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Unsigned Dot Product and Subtract
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * unsigned byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * The products of adjacent odd-even element pairs are added
+ * together and subtracted from the double-width elements of
+ * the in_c vector.
+ * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_h_bu(in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ out = __lasx_xvsub_h(in_c, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Signed Dot Product and Subtract
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Signed halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l, producing results
+ * twice the size of the input, i.e. signed word.
+ * The products of adjacent odd-even element pairs are added
+ * together and subtracted from the double-width elements of
+ * the in_c vector.
+ * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ * in_c : 0,0,0,0, 0,0,0,0
+ * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
+ * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ * out : -7,-3,0,0, 0,-1,0,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ out = __lasx_xvsub_w(in_c, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Signed halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * four times the size of input i.e. signed doubleword.
+ * The products of four adjacent element pairs are then
+ * added together and stored to the out vector.
+ * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
+ * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
+ * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
+ * out : -2,0,1,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ out = __lasx_xvhaddw_d_w(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high-half elements of the vectors are expanded to
+ * double width and added.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The high halves of in_h and in_l are sign-extended
+ * (signed byte to signed halfword) and added; the results
+ * are stored to the out vector.
+ * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvh_b(in_h, in_l);
+ out = __lasx_xvhaddw_h_b(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high-half elements of the vectors are expanded to
+ * double width and added.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The high halves of in_h and in_l are sign-extended
+ * (signed halfword to signed word) and added; the results
+ * are stored to the out vector.
+ * Example : out = __lasx_xvaddwh_w_h(in_h, in_l)
+ * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ * out : 1,0,0,-1, 1,0,0, 2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvh_h(in_h, in_l);
+ out = __lasx_xvhaddw_w_h(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of the vectors are expanded to
+ * double width and added.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low halves of in_h and in_l are sign-extended
+ * (signed byte to signed halfword) and added; the results
+ * are stored to the out vector.
+ * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvl_b(in_h, in_l);
+ out = __lasx_xvhaddw_h_b(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of the vectors are expanded to
+ * double width and added.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low halves of in_h and in_l are sign-extended
+ * (signed halfword to signed word) and added; the results
+ * are stored to the out vector.
+ * Example : out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ * out : 5,-1,4,2, 1,0,2,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvl_h(in_h, in_l);
+ out = __lasx_xvhaddw_w_h(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of the vectors are expanded to
+ * double width and added.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low halves of in_h and in_l are zero-extended
+ * (unsigned byte to unsigned halfword) and added; the
+ * results are stored to the out vector.
+ * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvl_b(in_h, in_l);
+ out = __lasx_xvhaddw_hu_bu(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of in_l are expanded to double
+ * width and added to in_h.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low half of in_l is zero-extended (unsigned byte to
+ * unsigned halfword) and added to the in_h vector.
+ * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvsllwil_hu_bu(in_l, 0);
+ out = __lasx_xvadd_h(in_h, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of in_l are expanded to double
+ * width and added to in_h.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low half of in_l is sign-extended (signed halfword to
+ * signed word) and added to the in_h vector.
+ * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ * in_h : 0, 1,0,0, -1,0,0,1,
+ * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1,
+ * out : 2, 0,1,2, -1,0,1,1,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvsllwil_w_h(in_l, 0);
+ out = __lasx_xvadd_w(in_h, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ * of the lower half of the vector.
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Details : The low halves of in_h and in_l are sign-extended (signed
+ * halfword to signed word) and multiplied; the result is
+ * added to the vector in_c, then stored to the out vector.
+ * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ * in_c : 1,2,3,4, 5,6,7,8
+ * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
+ * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000,
+ * -200,-300,-400,-500, -2000,-3000,-4000,-5000
+ * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+ tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+ tmp0 = __lasx_xvmul_w(tmp0, tmp1);
+ out = __lasx_xvadd_w(tmp0, in_c);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ * of the higher half of the vector.
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Details : The high halves of in_h and in_l are sign-extended
+ * (signed halfword to signed word) and multiplied; the
+ * result is added to the vector in_c, then stored to the
+ * out vector.
+ * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c,
+ __m256i in_h,
+ __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvilvh_h(in_h, in_h);
+ tmp1 = __lasx_xvilvh_h(in_l, in_l);
+ tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
+ out = __lasx_xvadd_w(tmp0, in_c);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the lower
+ * half of the vector.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low halves of in_h and in_l are sign-extended (signed
+ * halfword to signed word) and multiplied; the results are
+ * stored to the out vector.
+ * Example : out = __lasx_xvmulwl_w_h(in_h, in_l)
+ * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ * out : 6,1,3,0, 0,0,1,0
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+ tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+ out = __lasx_xvmul_w(tmp0, tmp1);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the higher
+ * half of the vector.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The high halves of in_h and in_l are sign-extended
+ * (signed halfword to signed word) and multiplied; the
+ * results are stored to the out vector.
+ * Example : out = __lasx_xvmulwh_w_h(in_h, in_l)
+ * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ * out : 0,0,0,0, 0,0,0,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvilvh_h(in_h, in_h);
+ tmp1 = __lasx_xvilvh_h(in_l, in_l);
+ out = __lasx_xvmulwev_w_h(tmp0, tmp1);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of in_l are expanded to double
+ * width and added to in_h with unsigned saturation.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The low half of in_l is zero-extended (unsigned byte to
+ * unsigned halfword) and added to in_h with unsigned
+ * saturation; the results are stored to the out vector.
+ * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
+ * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
+ * 0,0,0,1
+ * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
+ __m256i tmp1, out;
+ __m256i zero = {0};
+
+ tmp1 = __lasx_xvilvl_b(zero, in_l);
+ out = __lasx_xvsadd_hu(in_h, tmp1);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
+ * Arguments : Inputs - in (input vector)
+ * - min (min threshold)
+ * - max (max threshold)
+ * Outputs - out (output vector with clipped elements)
+ * Return Type - signed halfword
+ * Example : out = __lasx_xvclip_h(in, min, max)
+ * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
+ * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
+ * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
+ * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
+ __m256i out;
+
+ out = __lasx_xvmax_h(min, in);
+ out = __lasx_xvmin_h(max, out);
+ return out;
+}
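+
+/* A hypothetical usage sketch (variable names illustrative): clamp halfword
+ * sums to a 0..255 range before packing to bytes.
+ *   __m256i _min = __lasx_xvreplgr2vr_h(0);
+ *   __m256i _max = __lasx_xvreplgr2vr_h(255);
+ *   __m256i _res = __lasx_xvclip_h(_sum, _min, _max);
+ * For this common case, __lasx_xvclip255_h below avoids materializing the
+ * two bound vectors. */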
+
+/*
+ * =============================================================================
+ * Description : Clip all signed halfword elements of input vector
+ * between 0 & 255
+ * Arguments : Inputs - in (input vector)
+ * Outputs - out (output vector with clipped elements)
+ * Return Type - signed halfword
+ * Example : See out = __lasx_xvclip255_w(in)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_h(__m256i in) {
+ __m256i out;
+
+ out = __lasx_xvmaxi_h(in, 0);
+ out = __lasx_xvsat_hu(out, 7);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all signed word elements of input vector
+ * between 0 & 255
+ * Arguments : Inputs - in (input vector)
+ * Output - out (output vector with clipped elements)
+ * Return Type - signed word
+ * Example : out = __lasx_xvclip255_w(in)
+ * in : -8,255,280,249, -8,255,280,249
+ * out : 0,255,255,249, 0,255,255,249
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_w(__m256i in) {
+ __m256i out;
+
+ out = __lasx_xvmaxi_w(in, 0);
+ out = __lasx_xvsat_wu(out, 7);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ * elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ * if 'idx >= 8' use xvsplati_h_*.
+ * Arguments : Inputs - in, idx
+ * Output - out
+ * Details : Idx element value from in vector is replicated to all
+ * elements in out vector.
+ * Valid index range for halfword operation is 0-7
+ * Example : out = __lasx_xvsplati_l_h(in, idx)
+ * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
+ * idx : 0x02
+ * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
+ __m256i out;
+
+ out = __lasx_xvpermi_q(in, in, 0x02);
+ out = __lasx_xvreplve_h(out, idx);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ * elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ * if 'idx >= 8' use xvsplati_h_*.
+ * Arguments : Inputs - in, idx
+ * Output - out
+ * Details : Idx element value from in vector is replicated to all
+ * elements in out vector.
+ * Valid index range for this halfword operation is 8-15
+ * Example : out = __lasx_xvsplati_h_h(in, idx)
+ * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
+ * idx : 0x09
+ * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
+ __m256i out;
+
+ out = __lasx_xvpermi_q(in, in, 0x13);
+ out = __lasx_xvreplve_h(out, idx);
+ return out;
+}
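+
+/* A hypothetical dispatch sketch following the guidance above (idx and
+ * variable names illustrative): pick the variant from the index.
+ *   __m256i _coeff = (idx < 8) ? __lasx_xvsplati_l_h(_v, idx)
+ *                              : __lasx_xvsplati_h_h(_v, idx);
+ */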
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Example : LASX_TRANSPOSE4x4_D
+ * _in0 : 1,2,3,4
+ * _in1 : 1,2,3,4
+ * _in2 : 1,2,3,4
+ * _in3 : 1,2,3,4
+ *
+ * _out0 : 1,1,1,1
+ * _out1 : 2,2,2,2
+ * _out2 : 3,3,3,3
+ * _out3 : 4,4,4,4
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+ _out3) \
+ { \
+ __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
+ _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
+ _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
+ _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
+ _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
+ _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
+ _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7
+ * Example : LASX_TRANSPOSE8x8_W
+ * _in0 : 1,2,3,4,5,6,7,8
+ * _in1 : 2,2,3,4,5,6,7,8
+ * _in2 : 3,2,3,4,5,6,7,8
+ * _in3 : 4,2,3,4,5,6,7,8
+ * _in4 : 5,2,3,4,5,6,7,8
+ * _in5 : 6,2,3,4,5,6,7,8
+ * _in6 : 7,2,3,4,5,6,7,8
+ * _in7 : 8,2,3,4,5,6,7,8
+ *
+ * _out0 : 1,2,3,4,5,6,7,8
+ * _out1 : 2,2,2,2,2,2,2,2
+ * _out2 : 3,3,3,3,3,3,3,3
+ * _out3 : 4,4,4,4,4,4,4,4
+ * _out4 : 5,5,5,5,5,5,5,5
+ * _out5 : 6,6,6,6,6,6,6,6
+ * _out6 : 7,7,7,7,7,7,7,7
+ * _out7 : 8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _s0_m = __lasx_xvilvl_w(_in2, _in0); \
+ _s1_m = __lasx_xvilvl_w(_in3, _in1); \
+ _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_w(_in2, _in0); \
+ _s1_m = __lasx_xvilvh_w(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvl_w(_in6, _in4); \
+ _s1_m = __lasx_xvilvl_w(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_w(_in6, _in4); \
+ _s1_m = __lasx_xvilvh_w(_in7, _in5); \
+ _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
+ _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
+ _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
+ _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
+ _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
+ _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ * (input 16x8 byte block)
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x16 byte block)
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : See LASX_TRANSPOSE16x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
+ _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
+ _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
+ _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
+ _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
+ _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
+ _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
+ _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
+ _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
+ _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
+ _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
+ _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
+ _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
+ _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
+ _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
+ _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
+ _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
+ _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
+ _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
+ _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
+ _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
+ _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
+ _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
+ _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 halfword block
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ * (input 16x8 halfword block)
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x16 halfword block)
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : LASX_TRANSPOSE16x8_H
+ * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *
+ * _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
+ * _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
+ * _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
+ * _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
+ * _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+ * _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
+ * _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+ * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
+ _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
+ _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
+ _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
+ _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
+ _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
+ _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
+ _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
+ _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
+ _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
+ _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
+ _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
+ _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
+ _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
+ _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
+ _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
+ _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
+ _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
+ \
+ _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
+ _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
+ _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
+ _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
+ _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
+ _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
+ _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
+ _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
+ _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
+ _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
+ _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
+ _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
+ _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
+ _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
+ _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
+ _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
+ _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
+ _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
+ _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
+ _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with halfword elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Return Type - signed halfword
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+ _out3) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ \
+ _s0_m = __lasx_xvilvl_h(_in1, _in0); \
+ _s1_m = __lasx_xvilvl_h(_in3, _in2); \
+ _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _out1 = __lasx_xvilvh_d(_out0, _out0); \
+ _out3 = __lasx_xvilvh_d(_out2, _out2); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x8 byte block
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * (input 8x8 byte block)
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x8 byte block)
+ * Example : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
+ _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
+ _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
+ _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
+ _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
+ _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
+ _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
+ _out1 = __lasx_xvbsrl_v(_out0, 8); \
+ _out3 = __lasx_xvbsrl_v(_out2, 8); \
+ _out5 = __lasx_xvbsrl_v(_out4, 8); \
+ _out7 = __lasx_xvbsrl_v(_out6, 8); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with halfword elements in vectors.
+ * Arguments : Inputs - _in0, _in1, ~
+ * Outputs - _out0, _out1, ~
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : LASX_TRANSPOSE8x8_H
+ * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *
+ * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
+ * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ * _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
+ * _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
+ * _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
+ * _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
+ * _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
+ * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _s0_m = __lasx_xvilvl_h(_in6, _in4); \
+ _s1_m = __lasx_xvilvl_h(_in7, _in5); \
+ _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_h(_in6, _in4); \
+ _s1_m = __lasx_xvilvh_h(_in7, _in5); \
+ _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ \
+ _s0_m = __lasx_xvilvl_h(_in2, _in0); \
+ _s1_m = __lasx_xvilvl_h(_in3, _in1); \
+ _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_h(_in2, _in0); \
+ _s1_m = __lasx_xvilvh_h(_in3, _in1); \
+ _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ \
+ _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
+ _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
+ _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
+ _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
+ _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
+ _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
+ _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
+ _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Details : Butterfly operation
+ * Example : LASX_BUTTERFLY_4
+ * _out0 = _in0 + _in3;
+ * _out1 = _in1 + _in2;
+ * _out2 = _in1 - _in2;
+ * _out3 = _in0 - _in3;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_b(_in0, _in3); \
+ _out1 = __lasx_xvadd_b(_in1, _in2); \
+ _out2 = __lasx_xvsub_b(_in1, _in2); \
+ _out3 = __lasx_xvsub_b(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_h(_in0, _in3); \
+ _out1 = __lasx_xvadd_h(_in1, _in2); \
+ _out2 = __lasx_xvsub_h(_in1, _in2); \
+ _out3 = __lasx_xvsub_h(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_w(_in0, _in3); \
+ _out1 = __lasx_xvadd_w(_in1, _in2); \
+ _out2 = __lasx_xvsub_w(_in1, _in2); \
+ _out3 = __lasx_xvsub_w(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_d(_in0, _in3); \
+ _out1 = __lasx_xvadd_d(_in1, _in2); \
+ _out2 = __lasx_xvsub_d(_in1, _in2); \
+ _out3 = __lasx_xvsub_d(_in0, _in3); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
+ * Outputs - _out0, _out1, _out2, _out3, ~
+ * Details : Butterfly operation
+ * Example : LASX_BUTTERFLY_8
+ * _out0 = _in0 + _in7;
+ * _out1 = _in1 + _in6;
+ * _out2 = _in2 + _in5;
+ * _out3 = _in3 + _in4;
+ * _out4 = _in3 - _in4;
+ * _out5 = _in2 - _in5;
+ * _out6 = _in1 - _in6;
+ * _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_b(_in0, _in7); \
+ _out1 = __lasx_xvadd_b(_in1, _in6); \
+ _out2 = __lasx_xvadd_b(_in2, _in5); \
+ _out3 = __lasx_xvadd_b(_in3, _in4); \
+ _out4 = __lasx_xvsub_b(_in3, _in4); \
+ _out5 = __lasx_xvsub_b(_in2, _in5); \
+ _out6 = __lasx_xvsub_b(_in1, _in6); \
+ _out7 = __lasx_xvsub_b(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_h(_in0, _in7); \
+ _out1 = __lasx_xvadd_h(_in1, _in6); \
+ _out2 = __lasx_xvadd_h(_in2, _in5); \
+ _out3 = __lasx_xvadd_h(_in3, _in4); \
+ _out4 = __lasx_xvsub_h(_in3, _in4); \
+ _out5 = __lasx_xvsub_h(_in2, _in5); \
+ _out6 = __lasx_xvsub_h(_in1, _in6); \
+ _out7 = __lasx_xvsub_h(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_w(_in0, _in7); \
+ _out1 = __lasx_xvadd_w(_in1, _in6); \
+ _out2 = __lasx_xvadd_w(_in2, _in5); \
+ _out3 = __lasx_xvadd_w(_in3, _in4); \
+ _out4 = __lasx_xvsub_w(_in3, _in4); \
+ _out5 = __lasx_xvsub_w(_in2, _in5); \
+ _out6 = __lasx_xvsub_w(_in1, _in6); \
+ _out7 = __lasx_xvsub_w(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_d(_in0, _in7); \
+ _out1 = __lasx_xvadd_d(_in1, _in6); \
+ _out2 = __lasx_xvadd_d(_in2, _in5); \
+ _out3 = __lasx_xvadd_d(_in3, _in4); \
+ _out4 = __lasx_xvsub_d(_in3, _in4); \
+ _out5 = __lasx_xvsub_d(_in2, _in5); \
+ _out6 = __lasx_xvsub_d(_in1, _in6); \
+ _out7 = __lasx_xvsub_d(_in0, _in7); \
+ }
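
The 8-input variants follow the same mirrored add/sub pattern; a hedged
scalar generalization that covers both the 4- and 8-input cases (helper
name illustrative):

static void butterfly_n_scalar(const int16_t* in, int16_t* out, int n) {
  for (int i = 0; i < n / 2; ++i) {
    out[i] = (int16_t)(in[i] + in[n - 1 - i]);          /* top half: sums */
    out[n - 1 - i] = (int16_t)(in[i] - in[n - 1 - i]);  /* bottom: diffs */
  }
}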
+
+#endif // LASX
+
+/*
+ * =============================================================================
+ * Description : Print out elements in vector.
+ * Arguments   : Inputs - RTYPE, element_num, in0, enter
+ *               Outputs -
+ * Details     : Print out 'element_num' elements in 'RTYPE' vector 'in0'.
+ *               If 'enter' is TRUE, the prefix "\nVP:" is printed first.
+ * Example : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
+ * VP:1,2,3,4,
+ * =============================================================================
+ */
+#define VECT_PRINT(RTYPE, element_num, in0, enter) \
+ { \
+ RTYPE _tmp0 = (RTYPE)in0; \
+ int _i = 0; \
+ if (enter) \
+ printf("\nVP:"); \
+ for (_i = 0; _i < element_num; _i++) \
+ printf("%d,", _tmp0[_i]); \
+ }
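
A usage sketch matching the example in the comment above, assuming an
LSX/LASX build where v4i32 is defined:

v4i32 vec = {1, 2, 3, 4};
VECT_PRINT(v4i32, 4, vec, 1); /* prints "\nVP:1,2,3,4," */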
+
+#endif /* LOONGSON_INTRINSICS_H */
+#endif /* INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H */
diff --git a/files/include/libyuv/macros_msa.h b/include/libyuv/macros_msa.h
index 29997ce1..b9a44fcc 100644
--- a/files/include/libyuv/macros_msa.h
+++ b/include/libyuv/macros_msa.h
@@ -81,25 +81,35 @@
})
#endif // !(__mips == 64)
#else // !(__mips_isa_rev >= 6)
-#define LW(psrc) \
- ({ \
- const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
- uint32_t val_m; \
- asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_lw_m] "m"(*psrc_lw_m)); \
- val_m; \
+#define LW(psrc) \
+ ({ \
+ uint8_t* psrc_lw_m = (uint8_t*)(psrc); \
+ uint32_t val_lw_m; \
+ \
+ __asm__ volatile( \
+ "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
+ "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
+ \
+ : [val_lw_m] "=&r"(val_lw_m) \
+ : [psrc_lw_m] "r"(psrc_lw_m)); \
+ \
+ val_lw_m; \
})
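
The lwr/lwl pair above implements an unaligned 32-bit load on pre-R6 MIPS.
A portable sketch of the same operation, with an illustrative helper name:

#include <stdint.h>
#include <string.h>

static inline uint32_t load_u32_unaligned(const void* p) {
  uint32_t v;
  memcpy(&v, p, sizeof(v)); /* compilers emit lwr/lwl or ulw as appropriate */
  return v;
}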
#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
- uint64_t val_m = 0; \
- asm volatile("uld %[val_m], %[psrc_ld_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_ld_m] "m"(*psrc_ld_m)); \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ uint8_t* psrc_ld_m = (uint8_t*)(psrc); \
+ uint64_t val_ld_m = 0; \
+ \
+ __asm__ volatile( \
+ "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
+ "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
+ \
+ : [val_ld_m] "=&r"(val_ld_m) \
+ : [psrc_ld_m] "r"(psrc_ld_m)); \
+ \
+ val_ld_m; \
})
#else // !(__mips == 64)
#define LD(psrc) \
@@ -140,6 +150,9 @@
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)
+#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
+#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__)
+
#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
diff --git a/files/include/libyuv/mjpeg_decoder.h b/include/libyuv/mjpeg_decoder.h
index 275f8d4c..275f8d4c 100644
--- a/files/include/libyuv/mjpeg_decoder.h
+++ b/include/libyuv/mjpeg_decoder.h
diff --git a/files/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index f6f5b3ed..f9344721 100644
--- a/files/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -30,7 +30,10 @@ extern "C" {
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
#define LIBYUV_DISABLE_X86
#endif
#endif
@@ -83,6 +86,50 @@ void SetPlane(uint8_t* dst_y,
int height,
uint32_t value);
+// Convert a plane of tiles of 16 x H to linear.
+LIBYUV_API
+int DetilePlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int tile_height);
+
+// Convert a plane of 16 bit tiles of 16 x H to linear.
+LIBYUV_API
+int DetilePlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int tile_height);
+
+// Convert a UV plane of tiles of 16 x H into linear U and V planes.
+LIBYUV_API
+void DetileSplitUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int tile_height);
+
+// Convert a Y and UV plane of tiles into interlaced YUY2.
+LIBYUV_API
+void DetileToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height,
+ int tile_height);
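
A hedged usage sketch of DetilePlane. Buffer sizes, strides and the
tiled-source stride convention are assumptions for illustration; consult the
implementation before relying on them.

/* Linearize a 640x480 Y plane stored as 16x16 tiles (values illustrative). */
int ret = DetilePlane(src_y, /*src_stride_y=*/640,
                      dst_y, /*dst_stride_y=*/640,
                      /*width=*/640, /*height=*/480, /*tile_height=*/16);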
+
// Split interleaved UV plane into separate U and V planes.
LIBYUV_API
void SplitUVPlane(const uint8_t* src_uv,
@@ -105,6 +152,72 @@ void MergeUVPlane(const uint8_t* src_u,
int width,
int height);
+// Split interleaved msb UV plane into separate lsb U and V planes.
+LIBYUV_API
+void SplitUVPlane_16(const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int depth);
+
+// Merge separate lsb U and V planes into one interleaved msb UV plane.
+LIBYUV_API
+void MergeUVPlane_16(const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height,
+ int depth);
+
+// Convert lsb plane to msb plane.
+LIBYUV_API
+void ConvertToMSBPlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int depth);
+
+// Convert msb plane to lsb plane.
+LIBYUV_API
+void ConvertToLSBPlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int depth);
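
Inferred from the comments, not confirmed here: the per-pixel operation is
presumably a shift that moves the 'depth'-bit value between the low and high
bits of the 16-bit word:

uint16_t msb = (uint16_t)(lsb << (16 - depth));  /* lsb -> msb, presumed */
uint16_t lsb2 = (uint16_t)(msb >> (16 - depth)); /* msb -> lsb, presumed */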
+
+// Scale U and V to half width and height and merge into interleaved UV plane.
+// width and height are source size, allowing odd sizes.
+// Use for converting I444 or I422 to NV12.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
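
A sketch of the I444-to-NV12 chroma path described above. The plane geometry
is an assumption: src_u/src_v are width x height, and dst_uv is
((width + 1) / 2) * 2 bytes per row by (height + 1) / 2 rows.

HalfMergeUVPlane(src_u, /*src_stride_u=*/width,
                 src_v, /*src_stride_v=*/width,
                 dst_uv, /*dst_stride_uv=*/((width + 1) / 2) * 2,
                 width, height);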
+
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Split interleaved RGB plane into separate R, G and B planes.
LIBYUV_API
void SplitRGBPlane(const uint8_t* src_rgb,
@@ -131,6 +244,92 @@ void MergeRGBPlane(const uint8_t* src_r,
int width,
int height);
+// Split interleaved ARGB plane into separate R, G, B and A planes.
+// dst_a can be NULL to discard alpha plane.
+LIBYUV_API
+void SplitARGBPlane(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height);
+
+// Merge separate R, G, B and A planes into one interleaved ARGB plane.
+// src_a can be NULL to fill opaque value to alpha.
+LIBYUV_API
+void MergeARGBPlane(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Merge separate 'depth' bit R, G and B planes stored in lsb
+// into one interleaved XR30 plane.
+// depth should be in range [10, 16]
+LIBYUV_API
+void MergeXR30Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height,
+ int depth);
+
+// Merge separate 'depth' bit R, G, B and A planes stored in lsb
+// into one interleaved AR64 plane.
+// src_a can be NULL to fill opaque value to alpha.
+// depth should be in range [1, 16]
+LIBYUV_API
+void MergeAR64Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height,
+ int depth);
+
+// Merge separate 'depth' bit R, G, B and A planes stored in lsb
+// into one interleaved ARGB plane.
+// src_a can be NULL to fill opaque value to alpha.
+// depth should be in range [8, 16]
+LIBYUV_API
+void MergeARGB16To8Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int depth);
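
A hedged sketch of the presumed depth handling in the 16-to-8 merge: shifting
the low 'depth' bits down to 8 is an assumption consistent with the [8, 16]
range noted above (helper name illustrative).

static inline uint8_t depth_to_8bit(uint16_t v, int depth) {
  return (uint8_t)(v >> (depth - 8)); /* e.g. depth=10: 0..1023 -> 0..255 */
}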
+
// Copy I400. Supports inverting.
LIBYUV_API
int I400ToI400(const uint8_t* src_y,
@@ -178,6 +377,68 @@ int I444Copy(const uint8_t* src_y,
int width,
int height);
+// Copy I210 to I210.
+#define I210ToI210 I210Copy
+LIBYUV_API
+int I210Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy I410 to I410.
+#define I410ToI410 I410Copy
+LIBYUV_API
+int I410Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy NV12. Supports inverting.
+LIBYUV_API
+int NV12Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Copy NV21. Supports inverting.
+LIBYUV_API
+int NV21Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
@@ -245,6 +506,14 @@ int YUY2ToY(const uint8_t* src_yuy2,
int width,
int height);
+LIBYUV_API
+int UYVYToY(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
LIBYUV_API
int I420ToI400(const uint8_t* src_y,
@@ -293,6 +562,22 @@ int I400Mirror(const uint8_t* src_y,
int height);
// Alias
+#define NV12ToNV12Mirror NV12Mirror
+
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Alias
#define ARGBToARGBMirror ARGBMirror
// ARGB mirror.
@@ -304,56 +589,35 @@ int ARGBMirror(const uint8_t* src_argb,
int width,
int height);
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
+// Alias
+#define RGB24ToRGB24Mirror RGB24Mirror
-// I422ToARGB is in convert_argb.h
-// Convert I422 to BGRA.
+// RGB24 mirror.
LIBYUV_API
-int I422ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height);
+int RGB24Mirror(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
-// Convert I422 to ABGR.
+// Mirror a plane of data.
LIBYUV_API
-int I422ToABGR(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
+void MirrorPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
-// Convert I422 to RGBA.
+// Mirror a plane of UV data.
LIBYUV_API
-int I422ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height);
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
// Alias
#define RGB24ToRAW RAWToRGB24
@@ -566,15 +830,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
int width,
int height);
-typedef void (*ARGBBlendRow)(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width);
-
-// Get function to Alpha Blend ARGB pixels and store to destination.
-LIBYUV_API
-ARGBBlendRow GetARGBBlend();
-
// Alpha Blend ARGB images and store to destination.
// Source is pre-multiplied by alpha using ARGBAttenuate.
// Alpha of destination is set to 255.
@@ -734,6 +989,19 @@ int ARGBBlur(const uint8_t* src_argb,
int height,
int radius);
+// Gaussian 5x5 blur of a float plane.
+// Coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+ int src_stride,
+ float* dst,
+ int dst_stride,
+ int width,
+ int height);
+
// Multiply ARGB image by ARGB value.
LIBYUV_API
int ARGBShade(const uint8_t* src_argb,
@@ -759,6 +1027,21 @@ int InterpolatePlane(const uint8_t* src0,
int height,
int interpolation);
+// Interpolate between two images using a specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as an 8 bit fraction where 0 means 100% src0
+// and 255 means 1% src0 and 99% src1.
+LIBYUV_API
+int InterpolatePlane_16(const uint16_t* src0,
+ int src_stride0, // measured in 16 bit pixels
+ const uint16_t* src1,
+ int src_stride1,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ int interpolation);
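
A worked example of the 8-bit fraction, assuming the conventional rounded
blend (the exact rounding is an assumption):

static inline uint16_t interp_px(uint16_t a, uint16_t b, int f /* 0..255 */) {
  return (uint16_t)((a * (256 - f) + b * f + 128) >> 8);
}
/* f = 64 weights src0 at 75%: interp_px(100, 200, 64) == 125. */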
+
// Interpolate between two ARGB images using specified amount of interpolation
// Internally calls InterpolatePlane with width * 4 (bpp).
LIBYUV_API
@@ -815,7 +1098,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb,
int width);
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
-// shuffler is 16 bytes and must be aligned.
+// shuffler is 16 bytes.
LIBYUV_API
int ARGBShuffle(const uint8_t* src_bgra,
int src_stride_bgra,
@@ -825,6 +1108,17 @@ int ARGBShuffle(const uint8_t* src_bgra,
int width,
int height);
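
A sketch of a 16-byte shuffler that swaps the R and B channels of each ARGB
pixel; the byte-index-within-16-byte-group convention is an assumption based
on the pshufb-style shuffle described above.

static const uint8_t kShuffleSwapRB[16] = {2,  1, 0, 3,  6,  5,  4,  7,
                                           10, 9, 8, 11, 14, 13, 12, 15};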
+// Shuffle AR64 channel order. e.g. AR64 to AB64.
+// shuffler is 16 bytes.
+LIBYUV_API
+int AR64Shuffle(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ const uint8_t* shuffler,
+ int width,
+ int height);
+
// Sobel ARGB effect with planar output.
LIBYUV_API
int ARGBSobelToPlane(const uint8_t* src_argb,
diff --git a/include/libyuv/rotate.h b/include/libyuv/rotate.h
new file mode 100644
index 00000000..37460c4a
--- /dev/null
+++ b/include/libyuv/rotate.h
@@ -0,0 +1,296 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_H_
+#define INCLUDE_LIBYUV_ROTATE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported rotation.
+typedef enum RotationMode {
+ kRotate0 = 0, // No rotation.
+ kRotate90 = 90, // Rotate 90 degrees clockwise.
+ kRotate180 = 180, // Rotate 180 degrees.
+ kRotate270 = 270, // Rotate 270 degrees clockwise.
+
+ // Deprecated.
+ kRotateNone = 0,
+ kRotateClockwise = 90,
+ kRotateCounterClockwise = 270,
+} RotationModeEnum;
+
+// Rotate I420 frame.
+LIBYUV_API
+int I420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
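
A hedged usage sketch rotating an I420 frame 90 degrees clockwise. width and
height are the source dimensions; for kRotate90/270 the destination planes
are assumed to be height x width, so the destination strides below reflect
that.

I420Rotate(src_y, src_w, src_u, (src_w + 1) / 2, src_v, (src_w + 1) / 2,
           dst_y, src_h, dst_u, (src_h + 1) / 2, dst_v, (src_h + 1) / 2,
           src_w, src_h, kRotate90);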
+
+// Rotate I422 frame.
+LIBYUV_API
+int I422Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate I444 frame.
+LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate I010 frame.
+LIBYUV_API
+int I010Rotate(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate I210 frame.
+LIBYUV_API
+int I210Rotate(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate I410 frame.
+LIBYUV_API
+int I410Rotate(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate NV12 input and store in I420.
+LIBYUV_API
+int NV12ToI420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Convert Android420 to I420 with rotation.
+// "rotation" can be 0, 90, 180 or 270.
+LIBYUV_API
+int Android420ToI420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode rotation);
+
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate planes by 90, 180, 270. Deprecated.
+LIBYUV_API
+void RotatePlane90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane_16(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotations for when U and V are interleaved.
+// These functions take one UV input pointer and
+// split the data into two buffers while
+// rotating them.
+// width and height are expected to be half size for NV12.
+LIBYUV_API
+int SplitRotateUV(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+LIBYUV_API
+void SplitRotateUV90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+LIBYUV_API
+void SplitRotateUV180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+LIBYUV_API
+void SplitRotateUV270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose while reversing the read/write
+// order results in a rotation by +/- 90 degrees.
+// Deprecated.
+LIBYUV_API
+void TransposePlane(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void SplitTransposeUV(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
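
A scalar sketch of the transpose-plus-reversal relationship described above
(illustrative only, not libyuv code):

static void rotate90_scalar(const uint8_t* src, int w, int h, uint8_t* dst) {
  /* dst is h wide and w tall; clockwise 90 = transpose + reversed rows. */
  for (int y = 0; y < h; ++y)
    for (int x = 0; x < w; ++x)
      dst[x * h + (h - 1 - y)] = src[y * w + x];
}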
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_H_
diff --git a/files/include/libyuv/rotate_argb.h b/include/libyuv/rotate_argb.h
index 20432949..20432949 100644
--- a/files/include/libyuv/rotate_argb.h
+++ b/include/libyuv/rotate_argb.h
diff --git a/files/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h
index 022293ee..3e6a2fef 100644
--- a/files/include/libyuv/rotate_row.h
+++ b/include/libyuv/rotate_row.h
@@ -28,12 +28,16 @@ extern "C" {
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
#define LIBYUV_DISABLE_X86
#endif
#endif
-// The following are available for Visual C and clangcl 32 bit:
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// The following are available for Visual C 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
+ !defined(__clang__)
#define HAS_TRANSPOSEWX8_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
@@ -41,6 +45,8 @@ extern "C" {
// The following are available for GCC 32 or 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSE4X4_32_SSE2
+#define HAS_TRANSPOSE4X4_32_AVX2
#endif
// The following are available for 64 bit GCC:
@@ -53,6 +59,7 @@ extern "C" {
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSEWX8_NEON
#define HAS_TRANSPOSEUVWX8_NEON
+#define HAS_TRANSPOSE4X4_32_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -60,9 +67,9 @@ extern "C" {
#define HAS_TRANSPOSEUVWX16_MSA
#endif
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-#define HAS_TRANSPOSEWX8_MMI
-#define HAS_TRANSPOSEUVWX8_MMI
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#define HAS_TRANSPOSEWX16_LSX
+#define HAS_TRANSPOSEUVWX16_LSX
#endif
void TransposeWxH_C(const uint8_t* src,
@@ -92,11 +99,6 @@ void TransposeWx8_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx8_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width);
void TransposeWx8_Fast_SSSE3(const uint8_t* src,
int src_stride,
uint8_t* dst,
@@ -107,6 +109,11 @@ void TransposeWx16_MSA(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width);
+void TransposeWx16_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
void TransposeWx8_Any_NEON(const uint8_t* src,
int src_stride,
@@ -118,11 +125,6 @@ void TransposeWx8_Any_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx8_Any_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src,
int src_stride,
uint8_t* dst,
@@ -133,6 +135,11 @@ void TransposeWx16_Any_MSA(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width);
+void TransposeWx16_Any_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
void TransposeUVWxH_C(const uint8_t* src,
int src_stride,
@@ -171,13 +178,6 @@ void TransposeUVWx8_NEON(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width);
-void TransposeUVWx8_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width);
void TransposeUVWx16_MSA(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
@@ -185,6 +185,13 @@ void TransposeUVWx16_MSA(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width);
+void TransposeUVWx16_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
void TransposeUVWx8_Any_SSE2(const uint8_t* src,
int src_stride,
@@ -200,13 +207,6 @@ void TransposeUVWx8_Any_NEON(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width);
-void TransposeUVWx8_Any_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width);
void TransposeUVWx16_Any_MSA(const uint8_t* src,
int src_stride,
uint8_t* dst_a,
@@ -214,6 +214,55 @@ void TransposeUVWx16_Any_MSA(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width);
+void TransposeUVWx16_Any_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeWxH_16_C(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+void TransposeWx8_16_C(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx1_16_C(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width);
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+
+void Transpose4x4_32_SSE2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+
+void Transpose4x4_32_AVX2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+
+void Transpose4x4_32_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
#ifdef __cplusplus
} // extern "C"
diff --git a/files/include/libyuv/row.h b/include/libyuv/row.h
index 9bb48850..46685a50 100644
--- a/files/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -11,7 +11,8 @@
#ifndef INCLUDE_LIBYUV_ROW_H_
#define INCLUDE_LIBYUV_ROW_H_
-#include <stdlib.h> // For malloc.
+#include <stddef.h> // For NULL
+#include <stdlib.h> // For malloc
#include "libyuv/basic_types.h"
@@ -30,7 +31,10 @@ extern "C" {
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
#define LIBYUV_DISABLE_X86
#endif
#endif
@@ -74,7 +78,6 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
// Conversions:
-#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGB1555TOARGBROW_SSE2
#define HAS_ARGB4444TOARGBROW_SSE2
@@ -87,18 +90,13 @@ extern "C" {
#define HAS_ARGBTORGB24ROW_SSSE3
#define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565ROW_SSE2
-#define HAS_ARGBTOUV444ROW_SSSE3
-#define HAS_ARGBTOUVJROW_SSSE3
-#define HAS_ARGBTOUVROW_SSSE3
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
-#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
#define HAS_H422TOARGBROW_SSSE3
#define HAS_HALFFLOATROW_SSE2
-#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
@@ -108,11 +106,13 @@ extern "C" {
#define HAS_I422TOUYVYROW_SSE2
#define HAS_I422TOYUY2ROW_SSE2
#define HAS_I444TOARGBROW_SSSE3
+#define HAS_I444TORGB24ROW_SSSE3
+#define HAS_INTERPOLATEROW_SSSE3
#define HAS_J400TOARGBROW_SSE2
#define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSSE3
-#define HAS_MIRRORUVROW_SSSE3
+#define HAS_MIRRORSPLITUVROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV12TORGB24ROW_SSSE3
#define HAS_NV12TORGB565ROW_SSSE3
@@ -120,11 +120,12 @@ extern "C" {
#define HAS_NV21TORGB24ROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RAWTORGB24ROW_SSSE3
+#define HAS_RAWTOYJROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB24TOYJROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RGB565TOARGBROW_SSE2
-#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
#define HAS_SETROW_ERMS
#define HAS_SETROW_X86
@@ -137,11 +138,18 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2
+#if !defined(LIBYUV_BIT_EXACT)
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_RGBATOUVROW_SSSE3
+#endif
// Effects:
#define HAS_ARGBADDROW_SSE2
#define HAS_ARGBAFFINEROW_SSE2
-#define HAS_ARGBATTENUATEROW_SSSE3
#define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBCOLORMATRIXROW_SSSE3
#define HAS_ARGBCOLORTABLEROW_X86
@@ -156,11 +164,9 @@ extern "C" {
#define HAS_ARGBSEPIAROW_SSSE3
#define HAS_ARGBSHADEROW_SSE2
#define HAS_ARGBSUBTRACTROW_SSE2
-#define HAS_ARGBUNATTENUATEROW_SSE2
#define HAS_BLENDPLANEROW_SSSE3
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-#define HAS_INTERPOLATEROW_SSSE3
#define HAS_RGBCOLORTABLEROW_X86
#define HAS_SOBELROW_SSE2
#define HAS_SOBELTOPLANEROW_SSE2
@@ -175,6 +181,7 @@ extern "C" {
// TODO(fbarchard): fix build error on android_full_debug=1
// https://code.google.com/p/libyuv/issues/detail?id=517
#define HAS_I422ALPHATOARGBROW_SSSE3
+#define HAS_I444ALPHATOARGBROW_SSSE3
#endif
#endif
@@ -190,15 +197,11 @@ extern "C" {
#define HAS_ARGBPOLYNOMIALROW_AVX2
#define HAS_ARGBSHUFFLEROW_AVX2
#define HAS_ARGBTORGB565DITHERROW_AVX2
-#define HAS_ARGBTOUVJROW_AVX2
-#define HAS_ARGBTOUVROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
#define HAS_COPYROW_AVX
#define HAS_H422TOARGBROW_AVX2
#define HAS_HALFFLOATROW_AVX2
-// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast
-#define HAS_I400TOARGBROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
#define HAS_I422TOARGBROW_AVX2
@@ -206,6 +209,7 @@ extern "C" {
#define HAS_I422TORGB565ROW_AVX2
#define HAS_I422TORGBAROW_AVX2
#define HAS_I444TOARGBROW_AVX2
+#define HAS_I444TORGB24ROW_AVX2
#define HAS_INTERPOLATEROW_AVX2
#define HAS_J422TOARGBROW_AVX2
#define HAS_MERGEUVROW_AVX2
@@ -215,6 +219,8 @@ extern "C" {
#define HAS_NV12TORGB565ROW_AVX2
#define HAS_NV21TOARGBROW_AVX2
#define HAS_NV21TORGB24ROW_AVX2
+#define HAS_RAWTOYJROW_AVX2
+#define HAS_RGB24TOYJROW_AVX2
#define HAS_SPLITUVROW_AVX2
#define HAS_UYVYTOARGBROW_AVX2
#define HAS_UYVYTOUV422ROW_AVX2
@@ -224,13 +230,16 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
+// #define HAS_HALFFLOATROW_F16C // Enable to test half float cast
+#if !defined(LIBYUV_BIT_EXACT)
+#define HAS_ARGBTOUVJROW_AVX2
+#define HAS_ARGBTOUVROW_AVX2
+#endif
// Effects:
#define HAS_ARGBADDROW_AVX2
-#define HAS_ARGBATTENUATEROW_AVX2
#define HAS_ARGBMULTIPLYROW_AVX2
#define HAS_ARGBSUBTRACTROW_AVX2
-#define HAS_ARGBUNATTENUATEROW_AVX2
#define HAS_BLENDPLANEROW_AVX2
#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
@@ -238,13 +247,14 @@ extern "C" {
// TODO(fbarchard): fix build error on android_full_debug=1
// https://code.google.com/p/libyuv/issues/detail?id=517
#define HAS_I422ALPHATOARGBROW_AVX2
+#define HAS_I444ALPHATOARGBROW_AVX2
#endif
#endif
-// The following are available for AVX2 Visual C and clangcl 32 bit:
+// The following are available for AVX2 Visual C 32 bit:
// TODO(fbarchard): Port to gcc.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
- (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+ !defined(__clang__) && defined(VISUALC_HAS_AVX2)
#define HAS_ARGB1555TOARGBROW_AVX2
#define HAS_ARGB4444TOARGBROW_AVX2
#define HAS_ARGBTOARGB1555ROW_AVX2
@@ -257,62 +267,162 @@ extern "C" {
// The following are also available on x64 Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_X64) && \
(!defined(__clang__) || defined(__SSSE3__))
+#define HAS_I444ALPHATOARGBROW_SSSE3
+#define HAS_I444TOARGBROW_SSSE3
#define HAS_I422ALPHATOARGBROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
#endif
// The following are available for gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_AB64TOARGBROW_SSSE3
#define HAS_ABGRTOAR30ROW_SSSE3
+#define HAS_ABGRTOYJROW_SSSE3
+#define HAS_AR64TOARGBROW_SSSE3
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBTOAB64ROW_SSSE3
#define HAS_ARGBTOAR30ROW_SSSE3
+#define HAS_ARGBTOAR64ROW_SSSE3
+#define HAS_ARGBUNATTENUATEROW_SSE2
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
-// I210 is for H010. 2 = 422. I for 601 vs H for 709.
+#define HAS_DETILEROW_16_SSE2
+#define HAS_DETILEROW_SSE2
+#define HAS_DETILESPLITUVROW_SSSE3
+#define HAS_DETILETOYUY2_SSE2
+#define HAS_HALFMERGEUVROW_SSSE3
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3
+#define HAS_I212TOAR30ROW_SSSE3
+#define HAS_I212TOARGBROW_SSSE3
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I410TOAR30ROW_SSSE3
+#define HAS_I410TOARGBROW_SSSE3
#define HAS_I422TOAR30ROW_SSSE3
+#define HAS_MERGEARGBROW_SSE2
#define HAS_MERGERGBROW_SSSE3
+#define HAS_MERGEXRGBROW_SSE2
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_NV21TOYUV24ROW_SSSE3
+#define HAS_P210TOAR30ROW_SSSE3
+#define HAS_P210TOARGBROW_SSSE3
+#define HAS_P410TOAR30ROW_SSSE3
+#define HAS_P410TOARGBROW_SSSE3
+#define HAS_RAWTORGBAROW_SSSE3
+#define HAS_RGB24MIRRORROW_SSSE3
+#define HAS_RGBATOYJROW_SSSE3
+#define HAS_SPLITARGBROW_SSE2
+#define HAS_SPLITARGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
+#define HAS_SPLITXRGBROW_SSE2
+#define HAS_SPLITXRGBROW_SSSE3
+#define HAS_SWAPUVROW_SSSE3
+#define HAS_YUY2TONVUVROW_SSE2
+#if !defined(LIBYUV_BIT_EXACT)
+#define HAS_ABGRTOUVJROW_SSSE3
+#endif
+
+#if defined(__x86_64__) || !defined(__pic__)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I210ALPHATOARGBROW_SSSE3
+#define HAS_I410ALPHATOARGBROW_SSSE3
+#endif
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__)) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_AB64TOARGBROW_AVX2
#define HAS_ABGRTOAR30ROW_AVX2
+#define HAS_ABGRTOYJROW_AVX2
+#define HAS_ABGRTOYROW_AVX2
+#define HAS_AR64TOARGBROW_AVX2
+#define HAS_ARGBATTENUATEROW_AVX2
+#define HAS_ARGBTOAB64ROW_AVX2
#define HAS_ARGBTOAR30ROW_AVX2
+#define HAS_ARGBTOAR64ROW_AVX2
#define HAS_ARGBTORAWROW_AVX2
#define HAS_ARGBTORGB24ROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_DETILEROW_16_AVX
+#define HAS_DIVIDEROW_16_AVX2
+#define HAS_HALFMERGEUVROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
+#define HAS_I212TOAR30ROW_AVX2
+#define HAS_I212TOARGBROW_AVX2
+#define HAS_I400TOARGBROW_AVX2
+#define HAS_I410TOAR30ROW_AVX2
+#define HAS_I410TOARGBROW_AVX2
#define HAS_I422TOAR30ROW_AVX2
#define HAS_I422TOUYVYROW_AVX2
#define HAS_I422TOYUY2ROW_AVX2
+#define HAS_INTERPOLATEROW_16TO8_AVX2
+#define HAS_MERGEAR64ROW_AVX2
+#define HAS_MERGEARGB16TO8ROW_AVX2
+#define HAS_MERGEARGBROW_AVX2
#define HAS_MERGEUVROW_16_AVX2
+#define HAS_MERGEXR30ROW_AVX2
+#define HAS_MERGEXR64ROW_AVX2
+#define HAS_MERGEXRGB16TO8ROW_AVX2
+#define HAS_MERGEXRGBROW_AVX2
+#define HAS_MIRRORUVROW_AVX2
#define HAS_MULTIPLYROW_16_AVX2
-// TODO(fbarchard): Fix AVX2 version of YUV24
-// #define HAS_NV21TOYUV24ROW_AVX2
+#define HAS_NV21TOYUV24ROW_AVX2
+#define HAS_P210TOAR30ROW_AVX2
+#define HAS_P210TOARGBROW_AVX2
+#define HAS_P410TOAR30ROW_AVX2
+#define HAS_P410TOARGBROW_AVX2
+#define HAS_RGBATOYJROW_AVX2
+#define HAS_SPLITARGBROW_AVX2
+#define HAS_SPLITUVROW_16_AVX2
+#define HAS_SPLITXRGBROW_AVX2
+#define HAS_SWAPUVROW_AVX2
+#define HAS_YUY2TONVUVROW_AVX2
+#if !defined(LIBYUV_BIT_EXACT)
+#define HAS_ABGRTOUVJROW_AVX2
+#define HAS_ABGRTOUVROW_AVX2
+#endif
+
+#if defined(__x86_64__) || !defined(__pic__)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I210ALPHATOARGBROW_AVX2
+#define HAS_I410ALPHATOARGBROW_AVX2
+#endif
#endif
// The following are available for AVX512 clang x86 platforms:
// TODO(fbarchard): Port to GCC and Visual C
// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
- (defined(CLANG_HAS_AVX512))
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
#define HAS_ARGBTORGB24ROW_AVX512VBMI
+#define HAS_MERGEUVROW_AVX512BW
+#endif
+
+// The following are available for AVX512 clang x64 platforms:
+// TODO(fbarchard): Port to x86
+#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \
+ (defined(CLANG_HAS_AVX512))
+#define HAS_I422TOARGBROW_AVX512BW
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_AB64TOARGBROW_NEON
+#define HAS_ABGRTOUVJROW_NEON
#define HAS_ABGRTOUVROW_NEON
+#define HAS_ABGRTOYJROW_NEON
#define HAS_ABGRTOYROW_NEON
+#define HAS_AR64TOARGBROW_NEON
#define HAS_ARGB1555TOARGBROW_NEON
#define HAS_ARGB1555TOUVROW_NEON
#define HAS_ARGB1555TOYROW_NEON
@@ -321,6 +431,8 @@ extern "C" {
#define HAS_ARGB4444TOYROW_NEON
#define HAS_ARGBEXTRACTALPHAROW_NEON
#define HAS_ARGBSETROW_NEON
+#define HAS_ARGBTOAB64ROW_NEON
+#define HAS_ARGBTOAR64ROW_NEON
#define HAS_ARGBTOARGB1555ROW_NEON
#define HAS_ARGBTOARGB4444ROW_NEON
#define HAS_ARGBTORAWROW_NEON
@@ -338,8 +450,16 @@ extern "C" {
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_BYTETOFLOATROW_NEON
+#define HAS_CONVERT16TO8ROW_NEON
#define HAS_COPYROW_NEON
+#define HAS_DETILEROW_16_NEON
+#define HAS_DETILEROW_NEON
+#define HAS_DETILESPLITUVROW_NEON
+#define HAS_DETILETOYUY2_NEON
+#define HAS_UNPACKMT2T_NEON
+#define HAS_DIVIDEROW_16_NEON
#define HAS_HALFFLOATROW_NEON
+#define HAS_HALFMERGEUVROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_I422ALPHATOARGBROW_NEON
#define HAS_I422TOARGB1555ROW_NEON
@@ -350,11 +470,25 @@ extern "C" {
#define HAS_I422TORGBAROW_NEON
#define HAS_I422TOUYVYROW_NEON
#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I444ALPHATOARGBROW_NEON
#define HAS_I444TOARGBROW_NEON
+#define HAS_I444TORGB24ROW_NEON
+#define HAS_INTERPOLATEROW_16_NEON
+#define HAS_INTERPOLATEROW_NEON
#define HAS_J400TOARGBROW_NEON
+#define HAS_MERGEAR64ROW_NEON
+#define HAS_MERGEARGB16TO8ROW_NEON
+#define HAS_MERGEARGBROW_NEON
+#define HAS_MERGEUVROW_16_NEON
#define HAS_MERGEUVROW_NEON
+#define HAS_MERGEXR30ROW_NEON
+#define HAS_MERGEXR64ROW_NEON
+#define HAS_MERGEXRGB16TO8ROW_NEON
+#define HAS_MERGEXRGBROW_NEON
#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORSPLITUVROW_NEON
#define HAS_MIRRORUVROW_NEON
+#define HAS_MULTIPLYROW_16_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB24ROW_NEON
#define HAS_NV12TORGB565ROW_NEON
@@ -363,25 +497,35 @@ extern "C" {
#define HAS_NV21TOYUV24ROW_NEON
#define HAS_RAWTOARGBROW_NEON
#define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTORGBAROW_NEON
+#define HAS_RAWTOUVJROW_NEON
#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYJROW_NEON
#define HAS_RAWTOYROW_NEON
#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGB24TOUVJROW_NEON
#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYJROW_NEON
#define HAS_RGB24TOYROW_NEON
#define HAS_RGB565TOARGBROW_NEON
#define HAS_RGB565TOUVROW_NEON
#define HAS_RGB565TOYROW_NEON
#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYJROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
+#define HAS_SPLITARGBROW_NEON
#define HAS_SPLITRGBROW_NEON
+#define HAS_SPLITUVROW_16_NEON
#define HAS_SPLITUVROW_NEON
-#define HAS_UVToVUROW_NEON
+#define HAS_SPLITXRGBROW_NEON
+#define HAS_SWAPUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
#define HAS_YUY2TOARGBROW_NEON
+#define HAS_YUY2TONVUVROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
@@ -399,7 +543,7 @@ extern "C" {
#define HAS_ARGBSHADEROW_NEON
#define HAS_ARGBSHUFFLEROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON
-#define HAS_INTERPOLATEROW_NEON
+#define HAS_RGB24MIRRORROW_NEON
#define HAS_SOBELROW_NEON
#define HAS_SOBELTOPLANEROW_NEON
#define HAS_SOBELXROW_NEON
@@ -409,10 +553,13 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-#define HAS_FLOATDIVTOBYTEROW_NEON
+#define HAS_GAUSSCOL_F32_NEON
+#define HAS_GAUSSROW_F32_NEON
+#define HAS_INTERPOLATEROW_16TO8_NEON
#define HAS_SCALESUMSAMPLES_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_ABGRTOUVJROW_MSA
#define HAS_ABGRTOUVROW_MSA
#define HAS_ABGRTOYROW_MSA
#define HAS_ARGB1555TOARGBROW_MSA
@@ -449,8 +596,11 @@ extern "C" {
#define HAS_HALFFLOATROW_MSA
#define HAS_I400TOARGBROW_MSA
#define HAS_I422ALPHATOARGBROW_MSA
+#define HAS_I422TOARGB1555ROW_MSA
+#define HAS_I422TOARGB4444ROW_MSA
#define HAS_I422TOARGBROW_MSA
#define HAS_I422TORGB24ROW_MSA
+#define HAS_I422TORGB565ROW_MSA
#define HAS_I422TORGBAROW_MSA
#define HAS_I422TOUYVYROW_MSA
#define HAS_I422TOYUY2ROW_MSA
@@ -459,6 +609,7 @@ extern "C" {
#define HAS_J400TOARGBROW_MSA
#define HAS_MERGEUVROW_MSA
#define HAS_MIRRORROW_MSA
+#define HAS_MIRRORSPLITUVROW_MSA
#define HAS_MIRRORUVROW_MSA
#define HAS_NV12TOARGBROW_MSA
#define HAS_NV12TORGB565ROW_MSA
@@ -491,79 +642,217 @@ extern "C" {
#define HAS_YUY2TOYROW_MSA
#endif
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-#define HAS_ABGRTOUVROW_MMI
-#define HAS_ABGRTOYROW_MMI
-#define HAS_ARGB1555TOARGBROW_MMI
-#define HAS_ARGB1555TOUVROW_MMI
-#define HAS_ARGB1555TOYROW_MMI
-#define HAS_ARGB4444TOARGBROW_MMI
-#define HAS_ARGB4444TOUVROW_MMI
-#define HAS_ARGB4444TOYROW_MMI
-#define HAS_ARGBADDROW_MMI
-#define HAS_ARGBATTENUATEROW_MMI
-#define HAS_ARGBBLENDROW_MMI
-#define HAS_ARGBCOLORMATRIXROW_MMI
-#define HAS_ARGBCOPYALPHAROW_MMI
-#define HAS_ARGBCOPYYTOALPHAROW_MMI
-#define HAS_ARGBEXTRACTALPHAROW_MMI
-#define HAS_ARGBGRAYROW_MMI
-#define HAS_ARGBMIRRORROW_MMI
-#define HAS_ARGBMULTIPLYROW_MMI
-#define HAS_ARGBSEPIAROW_MMI
-#define HAS_ARGBSHADEROW_MMI
-#define HAS_ARGBSHUFFLEROW_MMI
-#define HAS_ARGBSUBTRACTROW_MMI
-#define HAS_ARGBTOARGB1555ROW_MMI
-#define HAS_ARGBTOARGB4444ROW_MMI
-#define HAS_ARGBTORAWROW_MMI
-#define HAS_ARGBTORGB24ROW_MMI
-#define HAS_ARGBTORGB565DITHERROW_MMI
-#define HAS_ARGBTORGB565ROW_MMI
-#define HAS_ARGBTOUV444ROW_MMI
-#define HAS_ARGBTOUVJROW_MMI
-#define HAS_ARGBTOUVROW_MMI
-#define HAS_ARGBTOYJROW_MMI
-#define HAS_ARGBTOYROW_MMI
-#define HAS_BGRATOUVROW_MMI
-#define HAS_BGRATOYROW_MMI
-#define HAS_BLENDPLANEROW_MMI
-#define HAS_COMPUTECUMULATIVESUMROW_MMI
-#define HAS_CUMULATIVESUMTOAVERAGEROW_MMI
-#define HAS_HALFFLOATROW_MMI
-#define HAS_I400TOARGBROW_MMI
-#define HAS_I422TOUYVYROW_MMI
-#define HAS_I422TOYUY2ROW_MMI
-#define HAS_INTERPOLATEROW_MMI
-#define HAS_J400TOARGBROW_MMI
-#define HAS_MERGERGBROW_MMI
-#define HAS_MERGEUVROW_MMI
-#define HAS_MIRRORROW_MMI
-#define HAS_MIRRORUVROW_MMI
-#define HAS_RAWTOARGBROW_MMI
-#define HAS_RAWTORGB24ROW_MMI
-#define HAS_RAWTOUVROW_MMI
-#define HAS_RAWTOYROW_MMI
-#define HAS_RGB24TOARGBROW_MMI
-#define HAS_RGB24TOUVROW_MMI
-#define HAS_RGB24TOYROW_MMI
-#define HAS_RGB565TOARGBROW_MMI
-#define HAS_RGB565TOUVROW_MMI
-#define HAS_RGB565TOYROW_MMI
-#define HAS_RGBATOUVROW_MMI
-#define HAS_RGBATOYROW_MMI
-#define HAS_SOBELROW_MMI
-#define HAS_SOBELTOPLANEROW_MMI
-#define HAS_SOBELXROW_MMI
-#define HAS_SOBELXYROW_MMI
-#define HAS_SOBELYROW_MMI
-#define HAS_SPLITRGBROW_MMI
-#define HAS_SPLITUVROW_MMI
-#define HAS_UYVYTOUVROW_MMI
-#define HAS_UYVYTOYROW_MMI
-#define HAS_YUY2TOUV422ROW_MMI
-#define HAS_YUY2TOUVROW_MMI
-#define HAS_YUY2TOYROW_MMI
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#define HAS_ABGRTOUVROW_LSX
+#define HAS_ABGRTOYROW_LSX
+#define HAS_ARGB1555TOARGBROW_LSX
+#define HAS_ARGB1555TOUVROW_LSX
+#define HAS_ARGB1555TOYROW_LSX
+#define HAS_ARGB4444TOARGBROW_LSX
+#define HAS_ARGBADDROW_LSX
+#define HAS_ARGBATTENUATEROW_LSX
+#define HAS_ARGBBLENDROW_LSX
+#define HAS_ARGBCOLORMATRIXROW_LSX
+#define HAS_ARGBEXTRACTALPHAROW_LSX
+#define HAS_ARGBGRAYROW_LSX
+#define HAS_ARGBSEPIAROW_LSX
+#define HAS_ARGBSHADEROW_LSX
+#define HAS_ARGBSHUFFLEROW_LSX
+#define HAS_ARGBSUBTRACTROW_LSX
+#define HAS_ARGBQUANTIZEROW_LSX
+#define HAS_ARGBSETROW_LSX
+#define HAS_ARGBTOARGB1555ROW_LSX
+#define HAS_ARGBTOARGB4444ROW_LSX
+#define HAS_ARGBTORAWROW_LSX
+#define HAS_ARGBTORGB24ROW_LSX
+#define HAS_ARGBTORGB565ROW_LSX
+#define HAS_ARGBTORGB565DITHERROW_LSX
+#define HAS_ARGBTOUVJROW_LSX
+#define HAS_ARGBTOUV444ROW_LSX
+#define HAS_ARGBTOUVROW_LSX
+#define HAS_ARGBTOYJROW_LSX
+#define HAS_ARGBMIRRORROW_LSX
+#define HAS_ARGBMULTIPLYROW_LSX
+#define HAS_BGRATOUVROW_LSX
+#define HAS_BGRATOYROW_LSX
+#define HAS_I400TOARGBROW_LSX
+#define HAS_I444TOARGBROW_LSX
+#define HAS_INTERPOLATEROW_LSX
+#define HAS_I422ALPHATOARGBROW_LSX
+#define HAS_I422TOARGB1555ROW_LSX
+#define HAS_I422TOARGB4444ROW_LSX
+#define HAS_I422TORGB24ROW_LSX
+#define HAS_I422TORGB565ROW_LSX
+#define HAS_I422TORGBAROW_LSX
+#define HAS_I422TOUYVYROW_LSX
+#define HAS_I422TOYUY2ROW_LSX
+#define HAS_J400TOARGBROW_LSX
+#define HAS_MERGEUVROW_LSX
+#define HAS_MIRRORROW_LSX
+#define HAS_MIRRORUVROW_LSX
+#define HAS_MIRRORSPLITUVROW_LSX
+#define HAS_NV12TOARGBROW_LSX
+#define HAS_NV12TORGB565ROW_LSX
+#define HAS_NV21TOARGBROW_LSX
+#define HAS_RAWTOARGBROW_LSX
+#define HAS_RAWTORGB24ROW_LSX
+#define HAS_RAWTOUVROW_LSX
+#define HAS_RAWTOYROW_LSX
+#define HAS_RGB24TOARGBROW_LSX
+#define HAS_RGB24TOUVROW_LSX
+#define HAS_RGB24TOYROW_LSX
+#define HAS_RGB565TOARGBROW_LSX
+#define HAS_RGB565TOUVROW_LSX
+#define HAS_RGB565TOYROW_LSX
+#define HAS_RGBATOUVROW_LSX
+#define HAS_RGBATOYROW_LSX
+#define HAS_SETROW_LSX
+#define HAS_SOBELROW_LSX
+#define HAS_SOBELTOPLANEROW_LSX
+#define HAS_SOBELXYROW_LSX
+#define HAS_SPLITUVROW_LSX
+#define HAS_UYVYTOARGBROW_LSX
+#define HAS_UYVYTOUV422ROW_LSX
+#define HAS_UYVYTOUVROW_LSX
+#define HAS_UYVYTOYROW_LSX
+#define HAS_YUY2TOARGBROW_LSX
+#define HAS_YUY2TOUVROW_LSX
+#define HAS_YUY2TOUV422ROW_LSX
+#define HAS_YUY2TOYROW_LSX
+#define HAS_ARGBTOYROW_LSX
+#define HAS_ABGRTOYJROW_LSX
+#define HAS_RGBATOYJROW_LSX
+#define HAS_RGB24TOYJROW_LSX
+#define HAS_RAWTOYJROW_LSX
+#endif
+
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#define HAS_I422TOARGBROW_LSX
+#endif
+
+#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
+#define HAS_ARGB1555TOARGBROW_LASX
+#define HAS_ARGB1555TOUVROW_LASX
+#define HAS_ARGB1555TOYROW_LASX
+#define HAS_ARGB4444TOARGBROW_LASX
+#define HAS_ARGBADDROW_LASX
+#define HAS_ARGBATTENUATEROW_LASX
+#define HAS_ARGBGRAYROW_LASX
+#define HAS_ARGBMIRRORROW_LASX
+#define HAS_ARGBMULTIPLYROW_LASX
+#define HAS_ARGBSEPIAROW_LASX
+#define HAS_ARGBSHADEROW_LASX
+#define HAS_ARGBSHUFFLEROW_LASX
+#define HAS_ARGBSUBTRACTROW_LASX
+#define HAS_ARGBTOARGB1555ROW_LASX
+#define HAS_ARGBTOARGB4444ROW_LASX
+#define HAS_ARGBTORAWROW_LASX
+#define HAS_ARGBTORGB24ROW_LASX
+#define HAS_ARGBTORGB565DITHERROW_LASX
+#define HAS_ARGBTORGB565ROW_LASX
+#define HAS_ARGBTOUV444ROW_LASX
+#define HAS_ARGBTOUVJROW_LASX
+#define HAS_ARGBTOUVROW_LASX
+#define HAS_ARGBTOYJROW_LASX
+#define HAS_ARGBTOYROW_LASX
+#define HAS_ABGRTOYJROW_LASX
+#define HAS_ABGRTOYROW_LASX
+#define HAS_I422ALPHATOARGBROW_LASX
+#define HAS_I422TOARGB1555ROW_LASX
+#define HAS_I422TOARGB4444ROW_LASX
+#define HAS_I422TOARGBROW_LASX
+#define HAS_I422TORGB24ROW_LASX
+#define HAS_I422TORGB565ROW_LASX
+#define HAS_I422TORGBAROW_LASX
+#define HAS_I422TOUYVYROW_LASX
+#define HAS_I422TOYUY2ROW_LASX
+#define HAS_MIRRORROW_LASX
+#define HAS_MIRRORUVROW_LASX
+#define HAS_NV12TOARGBROW_LASX
+#define HAS_NV12TORGB565ROW_LASX
+#define HAS_NV21TOARGBROW_LASX
+#define HAS_RAWTOARGBROW_LASX
+#define HAS_RAWTOUVROW_LASX
+#define HAS_RAWTOYROW_LASX
+#define HAS_RGB24TOARGBROW_LASX
+#define HAS_RGB24TOUVROW_LASX
+#define HAS_RGB24TOYROW_LASX
+#define HAS_RGB565TOARGBROW_LASX
+#define HAS_RGB565TOUVROW_LASX
+#define HAS_RGB565TOYROW_LASX
+#define HAS_UYVYTOUV422ROW_LASX
+#define HAS_UYVYTOUVROW_LASX
+#define HAS_UYVYTOYROW_LASX
+#define HAS_YUY2TOUV422ROW_LASX
+#define HAS_YUY2TOUVROW_LASX
+#define HAS_YUY2TOYROW_LASX
+#define HAS_RGBATOYROW_LASX
+#define HAS_RGBATOYJROW_LASX
+#define HAS_BGRATOYROW_LASX
+#define HAS_RGB24TOYJROW_LASX
+#define HAS_RAWTOYJROW_LASX
+#endif
+
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
+#define HAS_COPYROW_RVV
+#if __riscv_v_intrinsic == 11000
+#define HAS_AB64TOARGBROW_RVV
+#define HAS_ABGRTOYJROW_RVV
+#define HAS_ABGRTOYROW_RVV
+#define HAS_AR64TOARGBROW_RVV
+#define HAS_AR64TOAB64ROW_RVV
+#define HAS_ARGBATTENUATEROW_RVV
+#define HAS_ARGBBLENDROW_RVV
+#define HAS_ARGBCOPYYTOALPHAROW_RVV
+#define HAS_ARGBEXTRACTALPHAROW_RVV
+#define HAS_ARGBTOAB64ROW_RVV
+#define HAS_ARGBTOABGRROW_RVV
+#define HAS_ARGBTOAR64ROW_RVV
+#define HAS_ARGBTOBGRAROW_RVV
+#define HAS_ARGBTORAWROW_RVV
+#define HAS_ARGBTORGB24ROW_RVV
+#define HAS_ARGBTORGBAROW_RVV
+#define HAS_ARGBTOYJROW_RVV
+#define HAS_ARGBTOYMATRIXROW_RVV
+#define HAS_ARGBTOYROW_RVV
+#define HAS_BGRATOYROW_RVV
+#define HAS_BLENDPLANEROW_RVV
+#define HAS_I400TOARGBROW_RVV
+#define HAS_I422ALPHATOARGBROW_RVV
+#define HAS_I422TOARGBROW_RVV
+#define HAS_I422TORGB24ROW_RVV
+#define HAS_I422TORGBAROW_RVV
+#define HAS_I444ALPHATOARGBROW_RVV
+#define HAS_I444TOARGBROW_RVV
+#define HAS_I444TORGB24ROW_RVV
+#define HAS_INTERPOLATEROW_RVV
+#define HAS_J400TOARGBROW_RVV
+#define HAS_MERGEARGBROW_RVV
+#define HAS_MERGERGBROW_RVV
+#define HAS_MERGEUVROW_RVV
+#define HAS_MERGEXRGBROW_RVV
+#define HAS_NV12TOARGBROW_RVV
+#define HAS_NV12TORGB24ROW_RVV
+#define HAS_NV21TOARGBROW_RVV
+#define HAS_NV21TORGB24ROW_RVV
+#define HAS_RAWTOARGBROW_RVV
+#define HAS_RAWTORGB24ROW_RVV
+#define HAS_RAWTORGBAROW_RVV
+#define HAS_RAWTOYJROW_RVV
+#define HAS_RAWTOYROW_RVV
+#define HAS_RGB24TOARGBROW_RVV
+#define HAS_RGB24TOYJROW_RVV
+#define HAS_RGB24TOYROW_RVV
+#define HAS_RGBATOARGBROW_RVV
+#define HAS_RGBATOYJROW_RVV
+#define HAS_RGBATOYMATRIXROW_RVV
+#define HAS_RGBATOYROW_RVV
+#define HAS_RGBTOYMATRIXROW_RVV
+#define HAS_SPLITARGBROW_RVV
+#define HAS_SPLITRGBROW_RVV
+#define HAS_SPLITUVROW_RVV
+#define HAS_SPLITXRGBROW_RVV
+#endif
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -572,8 +861,10 @@ extern "C" {
#else
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#endif
+#define LIBYUV_NOINLINE __declspec(noinline)
typedef __declspec(align(16)) int16_t vec16[8];
typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) float vecf32[4];
typedef __declspec(align(16)) int8_t vec8[16];
typedef __declspec(align(16)) uint16_t uvec16[8];
typedef __declspec(align(16)) uint32_t uvec32[4];
@@ -591,8 +882,10 @@ typedef __declspec(align(32)) uint8_t ulvec8[32];
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#endif
+#define LIBYUV_NOINLINE __attribute__((noinline))
typedef int16_t __attribute__((vector_size(16))) vec16;
typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef float __attribute__((vector_size(16))) vecf32;
typedef int8_t __attribute__((vector_size(16))) vec8;
typedef uint16_t __attribute__((vector_size(16))) uvec16;
typedef uint32_t __attribute__((vector_size(16))) uvec32;
@@ -605,8 +898,10 @@ typedef uint32_t __attribute__((vector_size(32))) ulvec32;
typedef uint8_t __attribute__((vector_size(32))) ulvec8;
#else
#define SIMD_ALIGNED(var) var
+#define LIBYUV_NOINLINE
typedef int16_t vec16[8];
typedef int32_t vec32[4];
+typedef float vecf32[4];
typedef int8_t vec8[16];
typedef uint16_t uvec16[8];
typedef uint32_t uvec32[4];
@@ -619,65 +914,40 @@ typedef uint32_t ulvec32[8];
typedef uint8_t ulvec8[32];
#endif
-#if defined(__aarch64__)
-// This struct is for Arm64 color conversion.
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+// This struct is for ARM and RISC-V color conversion.
struct YuvConstants {
- uvec16 kUVToRB;
- uvec16 kUVToRB2;
- uvec16 kUVToG;
- uvec16 kUVToG2;
- vec16 kUVBiasBGR;
- vec32 kYToRgb;
-};
-#elif defined(__arm__)
-// This struct is for ArmV7 color conversion.
-struct YuvConstants {
- uvec8 kUVToRB;
- uvec8 kUVToG;
- vec16 kUVBiasBGR;
- vec32 kYToRgb;
+ uvec8 kUVCoeff;
+ vec16 kRGBCoeffBias;
};
#else
// This struct is for Intel color conversion.
struct YuvConstants {
- int8_t kUVToB[32];
- int8_t kUVToG[32];
- int8_t kUVToR[32];
- int16_t kUVBiasB[16];
- int16_t kUVBiasG[16];
- int16_t kUVBiasR[16];
+ uint8_t kUVToB[32];
+ uint8_t kUVToG[32];
+ uint8_t kUVToR[32];
int16_t kYToRgb[16];
+ int16_t kYBiasToRgb[16];
};
// Offsets into YuvConstants structure
#define KUVTOB 0
#define KUVTOG 32
#define KUVTOR 64
-#define KUVBIASB 96
-#define KUVBIASG 128
-#define KUVBIASR 160
-#define KYTORGB 192
-#endif
-
-// Conversion matrix for YUV to RGB
-extern const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants); // BT.601
-extern const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants); // JPeg
-extern const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants); // BT.709
+#define KYTORGB 96
+#define KYBIASTORGB 128
-// Conversion matrix for YVU to BGR
-extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601
-extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg
-extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
+#endif
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
-#define align_buffer_64(var, size) \
- uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \
- uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
+#define align_buffer_64(var, size) \
+ void* var##_mem = malloc((size) + 63); /* NOLINT */ \
+ uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */
#define free_aligned_buffer_64(var) \
free(var##_mem); \
- var = 0
+ var = NULL
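
Usage sketch of the alignment macros above:

align_buffer_64(buf, 1024);  /* buf_mem holds the raw malloc; buf is the
                                64-byte aligned view of 1024 usable bytes */
/* ... use buf ... */
free_aligned_buffer_64(buf); /* frees buf_mem and sets buf to NULL */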
#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
#define OMITFP
@@ -749,12 +1019,25 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -762,12 +1045,6 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_NEON(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void I422ToRGBARow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -835,12 +1112,62 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I444ToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
@@ -848,12 +1175,36 @@ void I422ToARGBRow_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGBARow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGBARow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -861,30 +1212,92 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422AlphaToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGB24Row_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGB24Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGB565Row_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGB565Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGB4444Row_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB4444Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGB1555Row_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB1555Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
void NV12ToARGBRow_MSA(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_argb,
@@ -909,23 +1322,107 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
const struct YuvConstants* yuvconstants,
int width);
+void NV12ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_LSX(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_LSX(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ABGRToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width);
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
-void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width);
+void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
+void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
+void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width);
+void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
+void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
+void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
+void ARGBToYJRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
+void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -939,25 +1436,39 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_MSA(const uint8_t* src_argb0,
+void ARGBToUVRow_MSA(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUV444Row_MMI(const uint8_t* src_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void ARGBToUVRow_MMI(const uint8_t* src_argb0,
+void ARGBToUVRow_LSX(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUVRow_LASX(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_LSX(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_uj,
+ uint8_t* dst_vj,
+ int width);
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
@@ -983,6 +1494,16 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVJRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int src_stride_rgb565,
uint8_t* dst_u,
@@ -998,32 +1519,37 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
+void ABGRToUVJRow_MSA(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
+void ABGRToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
+void RGBAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RAWToUVRow_MSA(const uint8_t* src_rgb0,
+void RAWToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1038,56 +1564,78 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
+void BGRAToUVRow_LSX(const uint8_t* src_bgra,
+ int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
+void ABGRToUVRow_LSX(const uint8_t* src_abgr,
+ int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
+void RGBAToUVRow_LSX(const uint8_t* src_rgba,
+ int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
+void ARGBToUVJRow_LSX(const uint8_t* src_argb,
+ int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RAWToUVRow_MMI(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
- int src_stride_rgb565,
+void ARGBToUVJRow_LASX(const uint8_t* src_argb,
+ int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
+void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555,
int src_stride_argb1555,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
+void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_LSX(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_LASX(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_LSX(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_LASX(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_LSX(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_LASX(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
@@ -1095,46 +1643,82 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width);
-void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
+void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width);
+void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
-void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
-void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
-void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
-
-void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
-void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+
+void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
+void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width);
+void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
+void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width);
+void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width);
+
+void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void ABGRToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
+void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
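/* Editor's note: the Y vs YJ suffix on these rows distinguishes studio-range
   BT.601 luma (16..235) from full-range JPEG luma (0..255). A per-pixel
   reference using the common 8-bit fixed-point approximations; the exact
   constants are an assumption, not copied from this header. */
#include <stdint.h>
static uint8_t RGBToYRef(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8); /* 16..235 */
}
static uint8_t RGBToYJRef(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);    /* 0..255 */
}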
void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
-void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -1153,88 +1737,136 @@ void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void BGRAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ABGRToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGBAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGBToYJRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGBToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGB24ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RAWToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGB565ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGB1555ToYRow_Any_MMI(const uint8_t* src_ptr,
- uint8_t* dst_ptr,
- int width);
-void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr,
+
+void BGRAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+void RGB565ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
+void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVJRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVJRow_Any_SSSE3(const uint8_t* src_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -1243,7 +1875,7 @@ void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_v,
int width);
void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -1256,57 +1888,81 @@ void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr,
+void ARGBToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void ARGBToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_Any_LSX(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RAWToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void RGB24ToUVJRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVJRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -1350,96 +2006,131 @@ void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_Any_MMI(const uint8_t* src_ptr,
- int src_stride_ptr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void BGRAToUVRow_Any_MMI(const uint8_t* src_ptr,
+void ABGRToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_Any_MMI(const uint8_t* src_ptr,
+void BGRAToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_Any_MMI(const uint8_t* src_ptr,
+void RGBAToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGB24ToUVRow_Any_MMI(const uint8_t* src_ptr,
+void ARGBToUVJRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RAWToUVRow_Any_MMI(const uint8_t* src_ptr,
- int src_stride_ptr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void RGB565ToUVRow_Any_MMI(const uint8_t* src_ptr,
+void ARGBToUVJRow_Any_LASX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGB1555ToUVRow_Any_MMI(const uint8_t* src_ptr,
- int src_stride_ptr,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr,
+void ARGB1555ToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_C(const uint8_t* src_rgb0,
+void ARGB1555ToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_Any_LSX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_Any_LSX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_Any_LSX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+void ARGBToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ARGBToUVRow_C(const uint8_t* src_rgb0,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+void ABGRToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void BGRAToUVRow_C(const uint8_t* src_rgb0,
+void ARGBToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void ABGRToUVRow_C(const uint8_t* src_rgb0,
+void BGRAToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGBAToUVRow_C(const uint8_t* src_rgb0,
+void ABGRToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RGB24ToUVRow_C(const uint8_t* src_rgb0,
+void RGBAToUVRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void RAWToUVRow_C(const uint8_t* src_rgb0,
+void RGB24ToUVRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void RGB24ToUVJRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVJRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void RGB565ToUVRow_C(const uint8_t* src_rgb565,
int src_stride_rgb565,
uint8_t* dst_u,
@@ -1474,41 +2165,59 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
-void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_MSA(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_MMI(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
-void MirrorUVRow_C(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width);
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_LSX(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
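/* Editor's note: this hunk splits two jobs that used to share one name.
   MirrorUVRow_* now mirrors an interleaved UV row (pair order reversed, U
   still first within each pair), while MirrorSplitUVRow_* keeps the old
   mirror-and-deinterleave behaviour. A plausible reference for the new
   variant; the Ref name is hypothetical. */
#include <stdint.h>
static void MirrorUVRowRef(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  int x;
  src_uv += (width - 1) * 2; /* start at the last UV pair */
  for (x = 0; x < width; ++x) {
    dst_uv[0] = src_uv[0];
    dst_uv[1] = src_uv[1];
    dst_uv += 2;
    src_uv -= 2;
  }
}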
+
+void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width);
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
-void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -1520,7 +2229,24 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBMirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width);
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width);
+void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width);
+void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void SplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
@@ -1542,7 +2268,11 @@ void SplitUVRow_MSA(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void SplitUVRow_MMI(const uint8_t* src_uv,
+void SplitUVRow_LSX(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_RVV(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -1562,11 +2292,123 @@ void SplitUVRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void SplitUVRow_Any_MMI(const uint8_t* src_ptr,
+void SplitUVRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
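/* Editor's note: the 8-bit SplitUVRow/MergeUVRow pair is a plain NV12-style
   deinterleave/interleave; a scalar reference (hypothetical name). */
#include <stdint.h>
static void SplitUVRowRef(const uint8_t* src_uv, uint8_t* dst_u,
                          uint8_t* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[0]; /* U then V in the interleaved plane */
    dst_v[x] = src_uv[1];
    src_uv += 2;
  }
}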
-
+void DetileRow_C(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_NEON(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_Any_NEON(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_SSE2(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_Any_SSE2(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_AVX(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_Any_AVX(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width);
+void DetileRow_16_C(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileRow_16_NEON(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileRow_16_Any_NEON(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileRow_16_SSE2(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileRow_16_Any_SSE2(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileRow_16_AVX(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileRow_16_Any_AVX(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width);
+void DetileSplitUVRow_C(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void DetileSplitUVRow_Any_SSSE3(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void DetileSplitUVRow_NEON(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void DetileSplitUVRow_Any_NEON(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void DetileToYUY2_C(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width);
+void DetileToYUY2_SSE2(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width);
+void DetileToYUY2_Any_SSE2(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width);
+void DetileToYUY2_NEON(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width);
+void DetileToYUY2_Any_NEON(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width);
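/* Editor's note: the Detile* rows linearize tiled NV12-style layouts such as
   MM21. A plausible scalar reference assuming 16-byte-wide tiles and a width
   that is a multiple of 16; both the name and the tile width are assumptions. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
static void DetileRowRef(const uint8_t* src, ptrdiff_t src_tile_stride,
                         uint8_t* dst, int width) {
  int x;
  for (x = 0; x < width; x += 16) {
    memcpy(dst, src, 16);   /* one row of one tile */
    dst += 16;
    src += src_tile_stride; /* same row of the next tile */
  }
}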
+void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size);
+void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size);
void MergeUVRow_C(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
@@ -1579,6 +2421,10 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
+void MergeUVRow_AVX512BW(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
@@ -1587,7 +2433,11 @@ void MergeUVRow_MSA(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
-void MergeUVRow_MMI(const uint8_t* src_u,
+void MergeUVRow_LSX(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_RVV(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
@@ -1599,6 +2449,10 @@ void MergeUVRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void MergeUVRow_Any_AVX512BW(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void MergeUVRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -1607,11 +2461,39 @@ void MergeUVRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void MergeUVRow_Any_MMI(const uint8_t* y_buf,
+void MergeUVRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
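/* Editor's note: HalfMergeUVRow appears to fuse a 2x2 box downsample with UV
   interleaving (full-resolution planar U and V in, half-resolution NV12-style
   UV out). A sketch under that assumption for even widths; the rounding and
   the Ref name are assumptions. */
#include <stdint.h>
static void HalfMergeUVRowRef(const uint8_t* src_u, int src_stride_u,
                              const uint8_t* src_v, int src_stride_v,
                              uint8_t* dst_uv, int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_uv[0] = (uint8_t)((src_u[0] + src_u[1] + src_u[src_stride_u] +
                           src_u[src_stride_u + 1] + 2) >> 2);
    dst_uv[1] = (uint8_t)((src_v[0] + src_v[1] + src_v[src_stride_v] +
                           src_v[src_stride_v + 1] + 2) >> 2);
    src_u += 2;
    src_v += 2;
    dst_uv += 2;
  }
}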
+
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
@@ -1627,7 +2509,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
-void SplitRGBRow_MMI(const uint8_t* src_rgb,
+void SplitRGBRow_RVV(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
@@ -1642,11 +2524,6 @@ void SplitRGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
-void SplitRGBRow_Any_MMI(const uint8_t* src_ptr,
- uint8_t* dst_r,
- uint8_t* dst_g,
- uint8_t* dst_b,
- int width);
void MergeRGBRow_C(const uint8_t* src_r,
const uint8_t* src_g,
@@ -1663,7 +2540,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width);
-void MergeRGBRow_MMI(const uint8_t* src_r,
+void MergeRGBRow_RVV(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_rgb,
@@ -1678,31 +2555,471 @@ void MergeRGBRow_Any_NEON(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width);
-void MergeRGBRow_Any_MMI(const uint8_t* src_r,
- const uint8_t* src_g,
- const uint8_t* src_b,
- uint8_t* dst_rgb,
+void MergeARGBRow_C(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width);
+void MergeARGBRow_SSE2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width);
+void MergeARGBRow_AVX2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width);
+void MergeARGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width);
+void MergeARGBRow_RVV(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width);
+void MergeARGBRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SplitARGBRow_C(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_RVV(const uint8_t* src_rgba,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void SplitARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width);
+void MergeXRGBRow_C(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width);
+void MergeXRGBRow_SSE2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width);
+void MergeXRGBRow_AVX2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width);
+void MergeXRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width);
+void MergeXRGBRow_RVV(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width);
+void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeXRGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeXRGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SplitXRGBRow_C(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_RVV(const uint8_t* src_rgba,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitXRGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+
+void MergeXR30Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width);
+void MergeAR64Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width);
+void MergeARGB16To8Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width);
+void MergeXR64Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width);
+void MergeXRGB16To8Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
int width);
+void MergeXR30Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width);
+void MergeAR64Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width);
+void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width);
+void MergeXR64Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width);
+void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width);
+void MergeXR30Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width);
+void MergeXR30Row_10_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int /* depth */,
+ int width);
+void MergeAR64Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width);
+void MergeARGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width);
+void MergeXR64Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width);
+void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width);
+void MergeXR30Row_Any_AVX2(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
+void MergeAR64Row_Any_AVX2(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ const uint16_t* a_buf,
+ uint16_t* dst_ptr,
+ int depth,
+ int width);
+void MergeXR64Row_Any_AVX2(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint16_t* dst_ptr,
+ int depth,
+ int width);
+void MergeARGB16To8Row_Any_AVX2(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
+void MergeXRGB16To8Row_Any_AVX2(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
+void MergeXR30Row_Any_NEON(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
+void MergeXR30Row_10_Any_NEON(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
+void MergeAR64Row_Any_NEON(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ const uint16_t* a_buf,
+ uint16_t* dst_ptr,
+ int depth,
+ int width);
+void MergeARGB16To8Row_Any_NEON(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
+void MergeXR64Row_Any_NEON(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint16_t* dst_ptr,
+ int depth,
+ int width);
+void MergeXRGB16To8Row_Any_NEON(const uint16_t* r_buf,
+ const uint16_t* g_buf,
+ const uint16_t* b_buf,
+ uint8_t* dst_ptr,
+ int depth,
+ int width);
void MergeUVRow_16_C(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
- int scale, /* 64 for 10 bit */
+ int depth,
int width);
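
A note on the scale -> depth rename above: callers previously passed a precomputed multiplier (the old /* 64 for 10 bit */ comment, i.e. 1 << (16 - 10)); the new parameter is the source bit depth itself. A minimal C sketch of the assumed reference behavior; MergeUVRow_16_Sketch is a hypothetical name, not part of this patch:

#include <stdint.h>

/* Hedged sketch: interleave U and V, shifting samples up to the top of
 * 16 bits. With depth 10 the shift is 6, matching the old scale of 64. */
static void MergeUVRow_16_Sketch(const uint16_t* src_u,
                                 const uint16_t* src_v,
                                 uint16_t* dst_uv,
                                 int depth,
                                 int width) {
  int shift = 16 - depth;
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[0] = (uint16_t)(src_u[x] << shift);  /* U in the low lane */
    dst_uv[1] = (uint16_t)(src_v[x] << shift);  /* V interleaved after U */
    dst_uv += 2;
  }
}
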
void MergeUVRow_16_AVX2(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
- int scale,
+ int depth,
+ int width);
+void MergeUVRow_16_Any_AVX2(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width);
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
int width);
+void MergeUVRow_16_Any_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width);
+
+void SplitUVRow_16_C(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width);
+void SplitUVRow_16_AVX2(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width);
+void SplitUVRow_16_Any_AVX2(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width);
+void SplitUVRow_16_NEON(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width);
+void SplitUVRow_16_Any_NEON(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width);
-void MultiplyRow_16_AVX2(const uint16_t* src_y,
- uint16_t* dst_y,
- int scale,
- int width);
void MultiplyRow_16_C(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void MultiplyRow_16_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int scale,
+ int width);
+void MultiplyRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int scale,
+ int width);
+
+void DivideRow_16_C(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void DivideRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void DivideRow_16_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int scale,
+ int width);
+void DivideRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void DivideRow_16_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int scale,
+ int width);
void Convert8To16Row_C(const uint8_t* src_y,
uint16_t* dst_y,
@@ -1745,12 +3062,21 @@ void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
uint8_t* dst_ptr,
int scale,
int width);
+void Convert16To8Row_NEON(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width);
+void Convert16To8Row_Any_NEON(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int scale,
+ int width);
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count);
void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -1761,16 +3087,12 @@ void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count);
void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
-void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBCopyAlphaRow_Any_MMI(const uint8_t* src_ptr,
- uint8_t* dst_ptr,
- int width);
void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width);
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
@@ -1785,7 +3107,10 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
uint8_t* dst_a,
int width);
-void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
+void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
+void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
uint8_t* dst_a,
int width);
void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
@@ -1800,31 +3125,30 @@ void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr,
void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBExtractAlphaRow_Any_MMI(const uint8_t* src_ptr,
+void ARGBExtractAlphaRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
-void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBCopyYToAlphaRow_Any_MMI(const uint8_t* src_ptr,
- uint8_t* dst_ptr,
- int width);
void SetRow_C(uint8_t* dst, uint8_t v8, int width);
void SetRow_MSA(uint8_t* dst, uint8_t v8, int width);
void SetRow_X86(uint8_t* dst, uint8_t v8, int width);
void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width);
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width);
+void SetRow_LSX(uint8_t* dst, uint8_t v8, int width);
void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width);
void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width);
+void SetRow_Any_LSX(uint8_t* dst_ptr, uint8_t v32, int width);
void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width);
void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width);
@@ -1832,6 +3156,8 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width);
void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width);
void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width);
void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width);
+void ARGBSetRow_LSX(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_Any_LSX(uint8_t* dst_ptr, uint32_t v32, int width);
// ARGBShufflers for BGRAToARGB etc.
void ARGBShuffleRow_C(const uint8_t* src_argb,
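
The shuffler argument is a 4-byte index map: output byte i of each pixel is taken from input byte shuffler[i]. A hedged sketch of that contract (ARGBShuffleRow_Sketch is a hypothetical name; a map of {3, 2, 1, 0} would reverse the byte order of every 4-byte pixel):

#include <stdint.h>

/* Hedged sketch: per-pixel byte permutation driven by a 4-entry map. */
static void ARGBShuffleRow_Sketch(const uint8_t* src_argb,
                                  uint8_t* dst_argb,
                                  const uint8_t* shuffler,
                                  int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}
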
@@ -1854,10 +3180,14 @@ void ARGBShuffleRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
const uint8_t* shuffler,
int width);
-void ARGBShuffleRow_MMI(const uint8_t* src_argb,
+void ARGBShuffleRow_LSX(const uint8_t* src_argb,
uint8_t* dst_argb,
const uint8_t* shuffler,
int width);
+void ARGBShuffleRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width);
void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint8_t* param,
@@ -1874,15 +3204,20 @@ void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint8_t* param,
int width);
-void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr,
+void ARGBShuffleRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint8_t* param,
int width);
+void ARGBShuffleRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
+ int width);
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width);
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
@@ -1901,42 +3236,61 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width);
void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
-void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width);
+void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
-void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
-void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width);
void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width);
-void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
+void RGB565ToARGBRow_LSX(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width);
+void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width);
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width);
void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width);
-void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
+void ARGB1555ToARGBRow_LSX(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width);
+void ARGB1555ToARGBRow_LASX(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width);
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width);
void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width);
-void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
+void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width);
+void ARGB4444ToARGBRow_LASX(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width);
void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
@@ -1956,6 +3310,9 @@ void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@@ -1985,45 +3342,59 @@ void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr,
void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void RGB24ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+void RGB24ToARGBRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void RGB24ToARGBRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGBARow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RAWToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RAWToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGB24Row_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void RGB565ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+void RGB565ToARGBRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void RGB565ToARGBRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGB1555ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB1555ToARGBRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+void ARGB1555ToARGBRow_Any_LASX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGB4444ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+void ARGB4444ToARGBRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGB4444ToARGBRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
@@ -2040,15 +3411,15 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width);
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
uint8_t* dst,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
uint8_t* dst,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -2076,7 +3447,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
int width);
void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -2089,23 +3460,44 @@ void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
int width);
void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
+ int width);
+void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
int width);
+void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
+ int width);
-void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
-void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
-void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
-void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
+void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_LSX(const uint8_t* src_argb,
uint8_t* dst_rgb,
int width);
-void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
+void ARGBToARGB1555Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_LSX(const uint8_t* src_argb,
uint8_t* dst_rgb,
int width);
-void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width);
+void ARGBToARGB4444Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width);
+void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width);
+void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width);
+void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* dst_rgb24, int width);
+
+void ARGBToABGRRow_C(const uint8_t* src_argb, uint8_t* dst_abgr, int width);
+void ARGBToBGRARow_C(const uint8_t* src_argb, uint8_t* dst_bgra, int width);
void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -2115,11 +3507,85 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
+void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width);
+void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width);
+void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width);
+void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width);
+void AR64ToAB64Row_C(const uint16_t* src_ar64, uint16_t* dst_ab64, int width);
+void RGBAToARGBRow_C(const uint8_t* src_rgba, uint8_t* dst_argb, int width);
+void AR64ShuffleRow_C(const uint8_t* src_ar64,
+ uint8_t* dst_ar64,
+ const uint8_t* shuffler,
+ int width);
+void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width);
+void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width);
+void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width);
+void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width);
+void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width);
+void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width);
+void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width);
+void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width);
+void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width);
+void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width);
+void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width);
+void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width);
+void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width);
+void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width);
+void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width);
+void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width);
+void AR64ToAB64Row_RVV(const uint16_t* src_ar64, uint16_t* dst_ab64, int width);
+void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width);
+void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int width);
+void ARGBToAB64Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int width);
+void AR64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void AB64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToAR64Row_Any_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int width);
+void ARGBToAB64Row_Any_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int width);
+void AR64ToARGBRow_Any_AVX2(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void AB64ToARGBRow_Any_AVX2(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToAR64Row_Any_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int width);
+void ARGBToAB64Row_Any_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int width);
+void AR64ToARGBRow_Any_NEON(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
-void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -2131,7 +3597,7 @@ void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void J400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void J400ToARGBRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void I444ToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
@@ -2139,6 +3605,12 @@ void I444ToARGBRow_C(const uint8_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -2163,6 +3635,51 @@ void I210ToARGBRow_C(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210AlphaToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410AlphaToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444AlphaToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -2207,6 +3724,27 @@ void UYVYToARGBRow_C(const uint8_t* src_uyvy,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void P210ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
void I422ToRGBARow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -2243,6 +3781,12 @@ void I422ToARGBRow_AVX2(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGBARow_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2261,18 +3805,18 @@ void I444ToARGBRow_AVX2(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+void I444ToRGB24Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToRGB24Row_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
- uint8_t* dst_argb,
+ uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
-void I444ToARGBRow_AVX2(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2298,6 +3842,44 @@ void I210ToARGBRow_SSSE3(const uint16_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToAR30Row_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2316,6 +3898,58 @@ void I210ToAR30Row_AVX2(const uint16_t* y_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2365,6 +3999,10 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
@@ -2400,6 +4038,48 @@ void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
+
+void P210ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
void I422ToRGBARow_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2460,6 +4140,12 @@ void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGBRow_Any_AVX512BW(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2472,12 +4158,24 @@ void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2502,6 +4200,44 @@ void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2520,6 +4256,58 @@ void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I212ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I410AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2574,9 +4362,13 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_yuv24,
+void NV21ToYUV24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void NV21ToYUV24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* uv_buf,
@@ -2604,6 +4396,46 @@ void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void P210ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2659,30 +4491,61 @@ void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
const struct YuvConstants* yuvconstants,
int width);
-void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width);
-void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
-void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
-void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_C(const uint8_t* src_y,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_LSX(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_RVV(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* param,
int width);
void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* param,
int width);
void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
+ const struct YuvConstants* param,
int width);
-void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_Any_LSX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
// ARGB preattenuated alpha blend.
-void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
-void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+void ARGBBlendRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2690,11 +4553,15 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
-void ARGBBlendRow_MMI(const uint8_t* src_argb0,
+void ARGBBlendRow_LSX(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
-void ARGBBlendRow_C(const uint8_t* src_argb0,
+void ARGBBlendRow_RVV(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBBlendRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
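
Preattenuated means the foreground channels are already multiplied by alpha, so the blend reduces to out = fg + bg * (256 - fg_alpha) / 256 per channel. A hedged sketch under that assumption (hypothetical name; the shipped rows also fast-path alpha 0 and 255):

#include <stdint.h>

/* Hedged sketch of premultiplied-alpha "over" compositing. */
static void ARGBBlendRow_Sketch(const uint8_t* src_argb,
                                const uint8_t* src_argb1,
                                uint8_t* dst_argb,
                                int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t a = src_argb[3];
    dst_argb[0] = (uint8_t)(src_argb[0] + (((256 - a) * src_argb1[0]) >> 8));
    dst_argb[1] = (uint8_t)(src_argb[1] + (((256 - a) * src_argb1[1]) >> 8));
    dst_argb[2] = (uint8_t)(src_argb[2] + (((256 - a) * src_argb1[2]) >> 8));
    dst_argb[3] = 255;  /* output is treated as opaque */
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
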
@@ -2720,16 +4587,11 @@ void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
-void BlendPlaneRow_MMI(const uint8_t* src0,
+void BlendPlaneRow_RVV(const uint8_t* src0,
const uint8_t* src1,
const uint8_t* alpha,
uint8_t* dst,
int width);
-void BlendPlaneRow_Any_MMI(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_ptr,
- int width);
void BlendPlaneRow_C(const uint8_t* src0,
const uint8_t* src1,
const uint8_t* alpha,
@@ -2738,11 +4600,11 @@ void BlendPlaneRow_C(const uint8_t* src0,
// ARGB multiply images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
-void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+void ARGBMultiplyRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
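
Multiply treats each 8-bit channel as a fraction of 255 and scales one image by the other. One plausible reference formulation, hedged (hypothetical name; the real rows use an equivalent fixed-point shortcut rather than a division):

#include <stdint.h>

/* Hedged sketch: per-channel modulate, dst = a * b / 255, rounded. */
static void ARGBMultiplyRow_Sketch(const uint8_t* src_argb,
                                   const uint8_t* src_argb1,
                                   uint8_t* dst_argb,
                                   int width) {
  int x;
  for (x = 0; x < width * 4; ++x) {
    dst_argb[x] = (uint8_t)((src_argb[x] * src_argb1[x] + 127) / 255);
  }
}
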
-void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2750,7 +4612,7 @@ void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2758,7 +4620,7 @@ void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2774,21 +4636,29 @@ void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
+void ARGBMultiplyRow_LSX(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
-void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf,
+void ARGBMultiplyRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBMultiplyRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
// ARGB add images.
-void ARGBAddRow_C(const uint8_t* src_argb0,
+void ARGBAddRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
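
Add is a saturating per-byte sum. A hedged sketch (hypothetical name):

#include <stdint.h>

/* Hedged sketch: per-channel add, clamped to 255. */
static void ARGBAddRow_Sketch(const uint8_t* src_argb,
                              const uint8_t* src_argb1,
                              uint8_t* dst_argb,
                              int width) {
  int x;
  for (x = 0; x < width * 4; ++x) {
    int v = src_argb[x] + src_argb1[x];
    dst_argb[x] = (uint8_t)(v > 255 ? 255 : v);
  }
}
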
-void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+void ARGBAddRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2796,7 +4666,7 @@ void ARGBAddRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+void ARGBAddRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2804,7 +4674,7 @@ void ARGBAddRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBAddRow_NEON(const uint8_t* src_argb0,
+void ARGBAddRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2820,22 +4690,30 @@ void ARGBAddRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBAddRow_MMI(const uint8_t* src_argb0,
+void ARGBAddRow_LSX(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
-void ARGBAddRow_Any_MMI(const uint8_t* y_buf,
+void ARGBAddRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBAddRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
// ARGB subtract images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
-void ARGBSubtractRow_C(const uint8_t* src_argb0,
+void ARGBSubtractRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
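
Subtract mirrors add but clamps at zero. A hedged sketch (hypothetical name):

#include <stdint.h>

/* Hedged sketch: per-channel subtract, clamped to 0. */
static void ARGBSubtractRow_Sketch(const uint8_t* src_argb,
                                   const uint8_t* src_argb1,
                                   uint8_t* dst_argb,
                                   int width) {
  int x;
  for (x = 0; x < width * 4; ++x) {
    int v = src_argb[x] - src_argb1[x];
    dst_argb[x] = (uint8_t)(v < 0 ? 0 : v);
  }
}
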
-void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2843,7 +4721,7 @@ void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2851,7 +4729,7 @@ void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+void ARGBSubtractRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@@ -2867,14 +4745,22 @@ void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
+void ARGBSubtractRow_LSX(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
-void ARGBSubtractRow_Any_MMI(const uint8_t* y_buf,
+void ARGBSubtractRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBSubtractRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@@ -2963,24 +4849,40 @@ void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint32_t param,
int width);
-
-void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr,
+void ARGBToRGB565DitherRow_Any_LSX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
+ int width);
+void ARGBToRGB565DitherRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
+ int width);
+void ARGBToRGB24Row_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBToRAWRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGBToRGB565Row_Any_MMI(const uint8_t* src_ptr,
+void ARGBToRGB24Row_Any_LASX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBToARGB1555Row_Any_MMI(const uint8_t* src_ptr,
+void ARGBToRAWRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRAWRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_LSX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB565Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBToARGB4444Row_Any_MMI(const uint8_t* src_ptr,
+void ARGBToARGB1555Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBToRGB565DitherRow_Any_MMI(const uint8_t* src_ptr,
- uint8_t* dst_ptr,
- const uint32_t param,
- int width);
+void ARGBToARGB4444Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void I444ToARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
@@ -2988,12 +4890,25 @@ void I444ToARGBRow_Any_NEON(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -3051,9 +4966,9 @@ void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* dst_yuv24,
+void NV21ToYUV24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
const uint8_t* uv_buf,
@@ -3068,24 +4983,94 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void P210ToARGBRow_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToARGBRow_Any_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToARGBRow_Any_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P210ToAR30Row_Any_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void P410ToAR30Row_Any_NEON(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I444ToARGBRow_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGBRow_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGBARow_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -3093,30 +5078,92 @@ void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422AlphaToARGBRow_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGB24Row_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToRGB565Row_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB4444Row_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void I422ToARGB1555Row_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -3141,12 +5188,55 @@ void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void NV12ToARGBRow_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_LSX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_LSX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_LSX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -3157,6 +5247,10 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -3167,90 +5261,130 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
-void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
int src_stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
+void YUY2ToUVRow_LSX(const uint8_t* src_yuy2,
int src_stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUVRow_LASX(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
+void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
void YUY2ToUVRow_C(const uint8_t* src_yuy2,
int src_stride_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_C(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_Any_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_Any_SSE2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToNVUVRow_Any_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width);
void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr,
+void YUY2ToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr,
+void YUY2ToUV422Row_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void YUY2ToUV422Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
int stride_uyvy,
@@ -3292,25 +5426,35 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width);
void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
-void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
int src_stride_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
+void UYVYToUVRow_LSX(const uint8_t* src_uyvy,
int src_stride_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUVRow_LASX(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
+void UYVYToUV422Row_LSX(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUV422Row_LASX(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_C(const uint8_t* src_uyvy,
@@ -3324,7 +5468,7 @@ void UYVYToUV422Row_C(const uint8_t* src_uyvy,
int width);
void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -3334,7 +5478,7 @@ void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr,
int width);
void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -3344,7 +5488,7 @@ void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr,
int width);
void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr,
- int src_stride_ptr,
+ int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@@ -3353,53 +5497,67 @@ void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_v,
int width);
void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr,
+void UYVYToUVRow_Any_LSX(const uint8_t* src_ptr,
int src_stride_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void UYVYToUVRow_Any_LASX(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
+void UYVYToUV422Row_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
-void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
-void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
-void UVToVURow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void UYVYToUV422Row_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
void AYUVToUVRow_C(const uint8_t* src_ayuv,
- int stride_ayuv,
+ int src_stride_ayuv,
uint8_t* dst_uv,
int width);
void AYUVToVURow_C(const uint8_t* src_ayuv,
- int stride_ayuv,
+ int src_stride_ayuv,
uint8_t* dst_vu,
int width);
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
- int stride_ayuv,
+ int src_stride_ayuv,
uint8_t* dst_uv,
int width);
void AYUVToVURow_NEON(const uint8_t* src_ayuv,
- int stride_ayuv,
+ int src_stride_ayuv,
uint8_t* dst_vu,
int width);
-void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
-void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv,
- int stride_ayuv,
- uint8_t* dst_uv,
+void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void AYUVToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride,
+ uint8_t* dst_uv,
int width);
-void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv,
- int stride_ayuv,
+void AYUVToVURow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride,
uint8_t* dst_vu,
int width);
@@ -3478,41 +5636,61 @@ void I422ToYUY2Row_MSA(const uint8_t* src_y,
const uint8_t* src_v,
uint8_t* dst_yuy2,
int width);
-void I422ToYUY2Row_MMI(const uint8_t* src_y,
+void I422ToYUY2Row_LSX(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_yuy2,
int width);
+void I422ToYUY2Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
void I422ToUYVYRow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width);
-void I422ToUYVYRow_MMI(const uint8_t* src_y,
+void I422ToUYVYRow_LSX(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width);
+void I422ToUYVYRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
-void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf,
+void I422ToYUY2Row_Any_LSX(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
+void I422ToYUY2Row_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
-void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf,
+void I422ToUYVYRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
+void I422ToUYVYRow_Any_LASX(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
// Effects related row functions.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
@@ -3528,7 +5706,13 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
int width);
-void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
+void ARGBAttenuateRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
uint8_t* dst_argb,
int width);
void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr,
@@ -3543,9 +5727,12 @@ void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr,
void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
-void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr,
+void ARGBAttenuateRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
+void ARGBAttenuateRow_Any_LASX(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
// Inverse table for unattenuate, shared by C and SSE2.
extern const uint32_t fixed_invtbl8[256];
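
// A hedged sketch of how such a reciprocal table can be built and applied
// for unattenuation; the exact fixed-point scale libyuv uses is an
// assumption here (16.16 reciprocal of a/255):
//   uint32_t invtbl[256];
//   invtbl[0] = 0;  // alpha 0 leaves the channel at 0
//   for (int i = 1; i < 256; ++i) invtbl[i] = (255u << 16) / i;
//   // un-premultiply channel c by alpha a: v ~= c * 255 / a
//   uint32_t v = (c * invtbl[a]) >> 16;
//   uint8_t c_full = (uint8_t)(v > 255 ? 255 : v);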
@@ -3569,13 +5756,15 @@ void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
-void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
-void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width);
void ARGBColorMatrixRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
@@ -3593,7 +5782,7 @@ void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
const int8_t* matrix_argb,
int width);
-void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
+void ARGBColorMatrixRow_LSX(const uint8_t* src_argb,
uint8_t* dst_argb,
const int8_t* matrix_argb,
int width);
@@ -3632,6 +5821,11 @@ void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
int interval_size,
int interval_offset,
int width);
+void ARGBQuantizeRow_LSX(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
void ARGBShadeRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
@@ -3649,10 +5843,14 @@ void ARGBShadeRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
int width,
uint32_t value);
-void ARGBShadeRow_MMI(const uint8_t* src_argb,
+void ARGBShadeRow_LSX(const uint8_t* src_argb,
uint8_t* dst_argb,
int width,
uint32_t value);
+void ARGBShadeRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value);
// Used for blur.
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
@@ -3666,11 +5864,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
const int32_t* previous_cumsum,
int width);
-void ComputeCumulativeSumRow_MMI(const uint8_t* row,
- int32_t* cumsum,
- const int32_t* previous_cumsum,
- int width);
-
void CumulativeSumToAverageRow_C(const int32_t* tl,
const int32_t* bl,
int w,
@@ -3721,7 +5914,12 @@ void InterpolateRow_MSA(uint8_t* dst_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction);
-void InterpolateRow_MMI(uint8_t* dst_ptr,
+void InterpolateRow_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_RVV(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride,
int width,
@@ -3746,7 +5944,7 @@ void InterpolateRow_Any_MSA(uint8_t* dst_ptr,
ptrdiff_t src_stride_ptr,
int width,
int source_y_fraction);
-void InterpolateRow_Any_MMI(uint8_t* dst_ptr,
+void InterpolateRow_Any_LSX(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride_ptr,
int width,
@@ -3757,6 +5955,47 @@ void InterpolateRow_16_C(uint16_t* dst_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction);
+void InterpolateRow_16_NEON(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_16_Any_NEON(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
+
+void InterpolateRow_16To8_C(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_16To8_Any_NEON(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_16To8_Any_AVX2(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction);
// Sobel images.
void SobelXRow_C(const uint8_t* src_y0,
@@ -3779,11 +6018,6 @@ void SobelXRow_MSA(const uint8_t* src_y0,
const uint8_t* src_y2,
uint8_t* dst_sobelx,
int width);
-void SobelXRow_MMI(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width);
void SobelYRow_C(const uint8_t* src_y0,
const uint8_t* src_y1,
uint8_t* dst_sobely,
@@ -3800,10 +6034,6 @@ void SobelYRow_MSA(const uint8_t* src_y0,
const uint8_t* src_y1,
uint8_t* dst_sobely,
int width);
-void SobelYRow_MMI(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width);
void SobelRow_C(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
@@ -3820,7 +6050,7 @@ void SobelRow_MSA(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width);
-void SobelRow_MMI(const uint8_t* src_sobelx,
+void SobelRow_LSX(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width);
@@ -3840,7 +6070,7 @@ void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_y,
int width);
-void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
+void SobelToPlaneRow_LSX(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_y,
int width);
@@ -3860,7 +6090,7 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width);
-void SobelXYRow_MMI(const uint8_t* src_sobelx,
+void SobelXYRow_LSX(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width);
@@ -3876,7 +6106,7 @@ void SobelRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void SobelRow_Any_MMI(const uint8_t* y_buf,
+void SobelRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
@@ -3892,7 +6122,7 @@ void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void SobelToPlaneRow_Any_MMI(const uint8_t* y_buf,
+void SobelToPlaneRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
@@ -3908,7 +6138,7 @@ void SobelXYRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
-void SobelXYRow_Any_MMI(const uint8_t* y_buf,
+void SobelXYRow_Any_LSX(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
@@ -3984,6 +6214,14 @@ void HalfFloatRow_Any_MSA(const uint16_t* src_ptr,
uint16_t* dst_ptr,
float param,
int width);
+void HalfFloatRow_LSX(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_Any_LSX(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ float param,
+ int width);
void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width);
void ByteToFloatRow_NEON(const uint8_t* src,
float* dst,
@@ -3993,7 +6231,19 @@ void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
float* dst_ptr,
float param,
int width);
-
+// Convert FP16 Half Floats to FP32 Floats
+void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16
+ float* dst,
+ int width);
+// Convert a column of FP16 Half Floats to a row of FP32 Floats
+void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16
+ int src_stride, // stride in elements
+ float* dst,
+ int width);
+// Convert FP32 Floats to FP16 Half Floats
+void ConvertFP32ToFP16Row_NEON(const float* src,
+ uint16_t* dst, // fp16
+ int width);
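+
+// A hedged scalar reference for the FP16 -> FP32 mapping these rows are
+// assumed to compute (the NEON kernels presumably use hardware vcvt).
+// Needs <math.h>; handles zeros, subnormals, normals, inf and NaN:
+//   static float HalfToFloat(uint16_t h) {
+//     int sign = (h >> 15) ? -1 : 1;
+//     int exp = (h >> 10) & 0x1F;
+//     int mant = h & 0x3FF;
+//     if (exp == 31) return mant ? NAN : sign * INFINITY;    // inf/NaN
+//     if (exp == 0) return sign * ldexpf((float)mant, -24);  // subnormal
+//     return sign * ldexpf((float)(mant + 1024), exp - 25);  // normal
+//   }
+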
void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
int width,
@@ -4018,16 +6268,35 @@ float ScaleSumSamples_NEON(const float* src,
void ScaleSamples_C(const float* src, float* dst, float scale, int width);
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
-void FloatDivToByteRow_C(const float* src_weights,
- const float* src_values,
- uint8_t* dst_out,
- uint8_t* dst_mask,
- int width);
-void FloatDivToByteRow_NEON(const float* src_weights,
- const float* src_values,
- uint8_t* dst_out,
- uint8_t* dst_mask,
- int width);
+void GaussRow_F32_NEON(const float* src, float* dst, int width);
+void GaussRow_F32_C(const float* src, float* dst, int width);
+
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width);
+
+void GaussCol_F32_C(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width);
+
+void GaussRow_C(const uint32_t* src, uint16_t* dst, int width);
+void GaussCol_C(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width);
+
+void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width);
#ifdef __cplusplus
} // extern "C"
diff --git a/include/libyuv/scale.h b/include/libyuv/scale.h
new file mode 100644
index 00000000..bfe4a344
--- /dev/null
+++ b/include/libyuv/scale.h
@@ -0,0 +1,321 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_H_
+#define INCLUDE_LIBYUV_SCALE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported filtering.
+typedef enum FilterMode {
+ kFilterNone = 0, // Point sample; Fastest.
+ kFilterLinear = 1, // Filter horizontally only.
+ kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
+ kFilterBox = 3 // Highest quality.
+} FilterModeEnum;
+
+// Scale a YUV plane.
+// Returns 0 if successful.
+LIBYUV_API
+int ScalePlane(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
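+
+// A hedged usage sketch (sizes illustrative): upscale one tightly packed
+// 8-bit plane 2x, with stride equal to width:
+//   uint8_t src[64 * 64], dst[128 * 128];
+//   ScalePlane(src, 64, 64, 64, dst, 128, 128, 128, kFilterBilinear);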
+
+LIBYUV_API
+int ScalePlane_16(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Sample is expected to be in the low 12 bits.
+LIBYUV_API
+int ScalePlane_12(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I420Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
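+
+// A hedged usage sketch (buffer names and sizes are illustrative):
+// downscale a 640x360 I420 frame to 320x180; for 4:2:0 the U and V
+// planes are half width and half height:
+//   uint8_t src_y[640 * 360], src_u[320 * 180], src_v[320 * 180];
+//   uint8_t dst_y[320 * 180], dst_u[160 * 90], dst_v[160 * 90];
+//   int r = I420Scale(src_y, 640, src_u, 320, src_v, 320, 640, 360,
+//                     dst_y, 320, dst_u, 160, dst_v, 160, 320, 180,
+//                     kFilterBilinear);  // r == 0 on success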
+
+LIBYUV_API
+int I420Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+int I420Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales a YUV 4:4:4 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I444Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
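+
+// A hedged usage sketch (sizes illustrative): for 4:4:4 all three planes
+// are full size, so the chroma strides match the luma stride:
+//   uint8_t sy[320 * 240], su[320 * 240], sv[320 * 240];
+//   uint8_t dy[160 * 120], du[160 * 120], dv[160 * 120];
+//   I444Scale(sy, 320, su, 320, sv, 320, 320, 240,
+//             dy, 160, du, 160, dv, 160, 160, 120, kFilterBox);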
+
+LIBYUV_API
+int I444Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+int I444Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales a YUV 4:2:2 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+LIBYUV_API
+int I422Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
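+
+// A hedged usage sketch (sizes illustrative): for 4:2:2 the U and V
+// planes are half width but full height:
+//   uint8_t sy[640 * 360], su[320 * 360], sv[320 * 360];
+//   uint8_t dy[320 * 180], du[160 * 180], dv[160 * 180];
+//   I422Scale(sy, 640, su, 320, sv, 320, 640, 360,
+//             dy, 320, du, 160, dv, 160, 320, 180, kFilterBilinear);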
+
+LIBYUV_API
+int I422Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+int I422Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales an NV12 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// kFilterBox is not supported for the UV channel and will be treated as
+// bilinear.
+// Returns 0 if successful.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
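+
+// A hedged usage sketch (sizes illustrative): halve a 1280x720 NV12
+// frame; the interleaved UV plane has src_width bytes per row and half
+// the Y plane's rows:
+//   uint8_t src_y[1280 * 720], src_uv[1280 * 360];
+//   uint8_t dst_y[640 * 360], dst_uv[640 * 180];
+//   NV12Scale(src_y, 1280, src_uv, 1280, 1280, 720,
+//             dst_y, 640, dst_uv, 640, 640, 360, kFilterBilinear);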
+
+#ifdef __cplusplus
+// Legacy API. Deprecated.
+LIBYUV_API
+int Scale(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int dst_stride_y,
+ int dst_stride_u,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ LIBYUV_BOOL interpolate);
+
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif // __cplusplus
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_H_
diff --git a/files/include/libyuv/scale_argb.h b/include/libyuv/scale_argb.h
index 7641f18e..7641f18e 100644
--- a/files/include/libyuv/scale_argb.h
+++ b/include/libyuv/scale_argb.h
diff --git a/include/libyuv/scale_rgb.h b/include/libyuv/scale_rgb.h
new file mode 100644
index 00000000..d17c39fd
--- /dev/null
+++ b/include/libyuv/scale_rgb.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_RGB_H_
+#define INCLUDE_LIBYUV_SCALE_RGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// RGB can be RAW, RGB24 or YUV24
+// RGB scales 24-bit images by converting a row at a time to ARGB,
+// scaling with the ARGB row functions, then converting back to RGB.
+// TODO(fbarchard): Allow input/output formats to be specified.
+LIBYUV_API
+int RGBScale(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ int src_width,
+ int src_height,
+ uint8_t* dst_rgb,
+ int dst_stride_rgb,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
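+
+// A hedged usage sketch (sizes illustrative): shrink a tightly packed
+// RGB24 image; strides are in bytes, i.e. 3 * width:
+//   uint8_t src_rgb[800 * 600 * 3], dst_rgb[400 * 300 * 3];
+//   RGBScale(src_rgb, 800 * 3, 800, 600,
+//            dst_rgb, 400 * 3, 400, 300, kFilterBox);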
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_RGB_H_
diff --git a/files/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 6e207a9c..02ed61ca 100644
--- a/files/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -29,7 +29,10 @@ extern "C" {
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON)
+#define LIBYUV_DISABLE_NEON
+#endif
+#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86)
#define LIBYUV_DISABLE_X86
#endif
#endif
@@ -72,6 +75,43 @@ extern "C" {
#define HAS_SCALEROWDOWN4_SSSE3
#endif
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_SCALEUVROWDOWN2BOX_SSSE3
+#define HAS_SCALEROWUP2_LINEAR_SSE2
+#define HAS_SCALEROWUP2_LINEAR_SSSE3
+#define HAS_SCALEROWUP2_BILINEAR_SSE2
+#define HAS_SCALEROWUP2_BILINEAR_SSSE3
+#define HAS_SCALEROWUP2_LINEAR_12_SSSE3
+#define HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+#define HAS_SCALEROWUP2_LINEAR_16_SSE2
+#define HAS_SCALEROWUP2_BILINEAR_16_SSE2
+#define HAS_SCALEUVROWUP2_LINEAR_SSSE3
+#define HAS_SCALEUVROWUP2_BILINEAR_SSSE3
+#define HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+#define HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+#endif
+
+// The following are available for gcc/clang x86 platforms, but
+// require clang 3.4 or gcc 4.7.
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__)) && \
+ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_SCALEUVROWDOWN2BOX_AVX2
+#define HAS_SCALEROWUP2_LINEAR_AVX2
+#define HAS_SCALEROWUP2_BILINEAR_AVX2
+#define HAS_SCALEROWUP2_LINEAR_12_AVX2
+#define HAS_SCALEROWUP2_BILINEAR_12_AVX2
+#define HAS_SCALEROWUP2_LINEAR_16_AVX2
+#define HAS_SCALEROWUP2_BILINEAR_16_AVX2
+#define HAS_SCALEUVROWUP2_LINEAR_AVX2
+#define HAS_SCALEUVROWUP2_BILINEAR_AVX2
+#define HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+#define HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+#endif
+
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCl but requires a new compiler and validator.
@@ -96,6 +136,20 @@ extern "C" {
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEUVROWDOWN2_NEON
+#define HAS_SCALEUVROWDOWN2LINEAR_NEON
+#define HAS_SCALEUVROWDOWN2BOX_NEON
+#define HAS_SCALEUVROWDOWNEVEN_NEON
+#define HAS_SCALEROWUP2_LINEAR_NEON
+#define HAS_SCALEROWUP2_BILINEAR_NEON
+#define HAS_SCALEROWUP2_LINEAR_12_NEON
+#define HAS_SCALEROWUP2_BILINEAR_12_NEON
+#define HAS_SCALEROWUP2_LINEAR_16_NEON
+#define HAS_SCALEROWUP2_BILINEAR_16_NEON
+#define HAS_SCALEUVROWUP2_LINEAR_NEON
+#define HAS_SCALEUVROWUP2_BILINEAR_NEON
+#define HAS_SCALEUVROWUP2_LINEAR_16_NEON
+#define HAS_SCALEUVROWUP2_BILINEAR_16_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -111,21 +165,49 @@ extern "C" {
#define HAS_SCALEROWDOWN4_MSA
#endif
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-#define HAS_FIXEDDIV1_MIPS
-#define HAS_FIXEDDIV_MIPS
-#define HAS_SCALEADDROW_16_MMI
-#define HAS_SCALEADDROW_MMI
-#define HAS_SCALEARGBCOLS_MMI
-#define HAS_SCALEARGBCOLSUP2_MMI
-#define HAS_SCALEARGBROWDOWN2_MMI
-#define HAS_SCALEARGBROWDOWNEVEN_MMI
-#define HAS_SCALECOLS_16_MMI
-#define HAS_SCALECOLS_MMI
-#define HAS_SCALEROWDOWN2_16_MMI
-#define HAS_SCALEROWDOWN2_MMI
-#define HAS_SCALEROWDOWN4_16_MMI
-#define HAS_SCALEROWDOWN4_MMI
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#define HAS_SCALEARGBROWDOWN2_LSX
+#define HAS_SCALEARGBROWDOWNEVEN_LSX
+#define HAS_SCALEROWDOWN2_LSX
+#define HAS_SCALEROWDOWN4_LSX
+#define HAS_SCALEROWDOWN38_LSX
+#define HAS_SCALEFILTERCOLS_LSX
+#define HAS_SCALEADDROW_LSX
+#define HAS_SCALEARGBCOLS_LSX
+#define HAS_SCALEARGBFILTERCOLS_LSX
+#define HAS_SCALEROWDOWN34_LSX
+#endif
+
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
+#define HAS_SCALEADDROW_RVV
+// TODO: Test ScaleARGBRowDownEven_RVV and enable it
+// #define HAS_SCALEARGBROWDOWNEVEN_RVV
+#define HAS_SCALEUVROWDOWN4_RVV
+#define HAS_SCALEUVROWDOWNEVEN_RVV
+#if __riscv_v_intrinsic == 11000
+#define HAS_SCALEARGBROWDOWN2_RVV
+#define HAS_SCALEARGBROWDOWN2BOX_RVV
+#define HAS_SCALEARGBROWDOWN2LINEAR_RVV
+#define HAS_SCALEARGBROWDOWNEVENBOX_RVV
+#define HAS_SCALEROWDOWN2_RVV
+#define HAS_SCALEROWDOWN2BOX_RVV
+#define HAS_SCALEROWDOWN2LINEAR_RVV
+#define HAS_SCALEROWDOWN34_0_BOX_RVV
+#define HAS_SCALEROWDOWN34_1_BOX_RVV
+#define HAS_SCALEROWDOWN34_RVV
+#define HAS_SCALEROWDOWN38_2_BOX_RVV
+#define HAS_SCALEROWDOWN38_3_BOX_RVV
+#define HAS_SCALEROWDOWN38_RVV
+#define HAS_SCALEROWDOWN4_RVV
+#define HAS_SCALEROWDOWN4BOX_RVV
+#define HAS_SCALEROWUP2_BILINEAR_RVV
+#define HAS_SCALEROWUP2_LINEAR_RVV
+#define HAS_SCALEUVROWDOWN2_RVV
+#define HAS_SCALEUVROWDOWN2BOX_RVV
+#define HAS_SCALEUVROWDOWN2LINEAR_RVV
+#define HAS_SCALEUVROWUP2_BILINEAR_RVV
+#define HAS_SCALEUVROWUP2_LINEAR_RVV
+#endif
#endif
// Scale ARGB vertically with bilinear interpolation.
@@ -155,6 +237,31 @@ void ScalePlaneVertical_16(int src_height,
int wpp,
enum FilterMode filtering);
+void ScalePlaneVertical_16To8(int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int wpp,
+ int scale,
+ enum FilterMode filtering);
+
+void ScalePlaneDown2_16To8(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int scale,
+ enum FilterMode filtering);
+
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width,
int src_height,
@@ -200,6 +307,16 @@ void ScaleRowDown2_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width);
+void ScaleRowDown2_16To8_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale);
+void ScaleRowDown2_16To8_Odd_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale);
void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -208,6 +325,16 @@ void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width);
+void ScaleRowDown2Linear_16To8_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale);
+void ScaleRowDown2Linear_16To8_Odd_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale);
void ScaleRowDown2Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -220,6 +347,16 @@ void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width);
+void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale);
+void ScaleRowDown2Box_16To8_Odd_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale);
void ScaleRowDown4_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -260,6 +397,40 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* d,
int dst_width);
+
+void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
int dst_width,
@@ -375,6 +546,87 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
int dst_width,
int x32,
int dx);
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+
+void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_Any_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
+void ScaleUVCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleUVCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx);
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int,
+ int);
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx);
// Specialized scalers for x86.
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
@@ -442,6 +694,120 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
+
+void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -592,16 +958,6 @@ void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr,
int dst_width,
int x,
int dx);
-void ScaleARGBCols_MMI(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx);
-void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx);
// ARGB Row functions
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
@@ -628,6 +984,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
+void ScaleARGBRowDown2_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
@@ -640,15 +1008,15 @@ void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
+void ScaleARGBRowDown2_LSX(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
+void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
+void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width);
@@ -688,15 +1056,15 @@ void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDown2_Any_MMI(const uint8_t* src_ptr,
+void ScaleARGBRowDown2_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+void ScaleARGBRowDown2Linear_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+void ScaleARGBRowDown2Box_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
@@ -730,12 +1098,22 @@ void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
int src_stepx,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
+void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb,
ptrdiff_t src_stride,
int32_t src_stepx,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
+void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_argb,
@@ -770,16 +1148,285 @@ void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
int src_stepx,
uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDownEven_Any_MMI(const uint8_t* src_ptr,
+void ScaleARGBRowDownEven_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int32_t src_stepx,
uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
+void ScaleARGBRowDownEvenBox_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+// UV Row functions
+void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown4_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_ptr,
int dst_width);
+void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_Any_SSE41(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_Any_SSE41(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.
@@ -891,6 +1538,55 @@ void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_12_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_12_Any_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,
@@ -1012,93 +1708,184 @@ void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2_MMI(const uint8_t* src_ptr,
+void ScaleRowDown2_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
-void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
+void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
-void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
+void ScaleRowDown2Box_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
-void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-void ScaleRowDown2Box_Odd_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width);
-void ScaleRowDown4_MMI(const uint8_t* src_ptr,
+void ScaleRowDown4_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
-void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
+void ScaleRowDown4Box_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
-void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
-void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
- uint32_t* dst_ptr,
- int src_width);
-void ScaleColsUp2_MMI(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx);
-void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
- const uint16_t* src_ptr,
+void ScaleRowDown38_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleFilterCols_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx);
-void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx);
-
-void ScaleRowDown2_Any_MMI(const uint8_t* src_ptr,
+void ScaleARGBFilterCols_LSX(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_LSX(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleRowDown34_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width);
+void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width);
+void ScaleRowDown2_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+void ScaleRowDown2Linear_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+void ScaleRowDown2Box_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4_Any_MMI(const uint8_t* src_ptr,
+void ScaleRowDown4_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4Box_Any_MMI(const uint8_t* src_ptr,
+void ScaleRowDown4Box_Any_LSX(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
-void ScaleAddRow_Any_MMI(const uint8_t* src_ptr,
+void ScaleRowDown38_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_Any_LSX(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width);
+void ScaleFilterCols_Any_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_Any_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_Any_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleRowDown34_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_Any_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleRowDown2_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+
+void ScaleRowDown4_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
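The `_Any_` declarations added above follow libyuv's dispatch convention: a SIMD kernel processes the widest multiple of its block size, and the remainder is finished separately (the real macros in source/scale_any.cc copy the tail into a small temporary buffer and re-run the SIMD kernel once). Below is a minimal sketch of that shape, with hypothetical stand-in kernels and a scalar tail for brevity; it is illustrative only, not libyuv source.

// any_dispatch_sketch.cc - illustrative only; not libyuv source.
#include <cstdint>
#include <cstdio>

// Stand-in "SIMD" kernel: handles widths that are a multiple of 16 pixels.
// In libyuv this would be e.g. ScaleUVRowUp2_Linear_SSSE3.
static void ScaleRowBlock16(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) dst[i] = src[i];  // pretend-vectorized copy
}

// Portable scalar kernel, standing in for the _C suffix functions.
static void ScaleRowC(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) dst[i] = src[i];
}

// The wrapper shape behind the _Any_ declarations: SIMD on the aligned
// body, scalar on the tail, so callers may pass any dst_width.
static void ScaleRow_Any(const uint8_t* src, uint8_t* dst, int width) {
  int n = width & ~15;  // largest multiple of the 16-pixel block
  if (n > 0) ScaleRowBlock16(src, dst, n);
  if (width > n) ScaleRowC(src + n, dst + n, width - n);
}

int main() {
  uint8_t src[37], dst[37] = {0};
  for (int i = 0; i < 37; ++i) src[i] = static_cast<uint8_t>(i);
  ScaleRow_Any(src, dst, 37);  // 32 pixels via "SIMD", 5 via the scalar tail
  printf("dst[36] = %d\n", dst[36]);
  return 0;
}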
diff --git a/include/libyuv/scale_uv.h b/include/libyuv/scale_uv.h
new file mode 100644
index 00000000..8e74e319
--- /dev/null
+++ b/include/libyuv/scale_uv.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_UV_H_
+#define INCLUDE_LIBYUV_SCALE_UV_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scale a 16 bit UV image.
+// This function is currently incomplete; it can't handle all cases.
+LIBYUV_API
+int UVScale_16(const uint16_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_UV_H_
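The new UVScale entry point scales an interleaved UV plane, such as the chroma plane of an NV12 frame. A hypothetical caller is sketched below, assuming widths and heights are given in UV sample pairs and strides in bytes (two bytes, U then V, per pair); the buffer sizes and filter choice are illustrative.

// uvscale_usage.cc - a hypothetical caller, assuming a 640x360 NV12 frame
// whose interleaved UV plane is 320x180 UV pairs.
#include <cstdint>
#include <vector>
#include "libyuv/scale_uv.h"

int main() {
  const int src_w = 320, src_h = 180;  // (luma_w + 1) / 2 x (luma_h + 1) / 2
  const int dst_w = 160, dst_h = 90;
  std::vector<uint8_t> src(static_cast<size_t>(src_w) * src_h * 2, 128);
  std::vector<uint8_t> dst(static_cast<size_t>(dst_w) * dst_h * 2);
  // Strides are in bytes: two bytes (U then V) per UV pair.
  return libyuv::UVScale(src.data(), src_w * 2, src_w, src_h,
                         dst.data(), dst_w * 2, dst_w, dst_h,
                         libyuv::kFilterBilinear);
}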
diff --git a/files/include/libyuv/version.h b/include/libyuv/version.h
index 741ef34d..a9c54400 100644
--- a/files/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1732
+#define LIBYUV_VERSION 1883
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/files/include/libyuv/video_common.h b/include/libyuv/video_common.h
index ffcbdbf1..32b8a521 100644
--- a/files/include/libyuv/video_common.h
+++ b/include/libyuv/video_common.h
@@ -50,7 +50,7 @@ extern "C" {
// Secondary formats are converted in 2 steps.
// Auxiliary formats call primary converters.
enum FourCC {
- // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+  // 10 Primary YUV formats: 6 planar, 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
@@ -59,17 +59,20 @@ enum FourCC {
FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
- FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
+ FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420
+ FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422
- // 1 Secondary YUV format: row biplanar.
+  // 1 Secondary YUV format: row biplanar. Deprecated.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
- // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
+  // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 2 10 bpc, 2 64 bpp
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010.
FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit
+ FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel.
+ FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit
FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
@@ -80,15 +83,36 @@ enum FourCC {
// 1 Primary Compressed YUV format.
FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
- // 8 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+  // 25 Auxiliary YUV variations: 3 with U and V planes swapped, 1 alias.
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
- FOURCC_J420 = FOURCC('J', '4', '2', '0'),
- FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
- FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
- FOURCC_H422 = FOURCC('H', '4', '2', '2'), // unofficial fourcc
+ FOURCC_J420 =
+ FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J422 =
+ FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J444 =
+ FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J400 =
+ FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_F420 = FOURCC('F', '4', '2', '0'), // bt.709 full, unofficial fourcc
+ FOURCC_F422 = FOURCC('F', '4', '2', '2'), // bt.709 full, unofficial fourcc
+ FOURCC_F444 = FOURCC('F', '4', '4', '4'), // bt.709 full, unofficial fourcc
+ FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc
+ FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc
+ FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc
+ FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc
+ FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc
+ FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc
+ FOURCC_F010 = FOURCC('F', '0', '1', '0'), // bt.709 full range 10 bit 420
+ FOURCC_H010 = FOURCC('H', '0', '1', '0'), // bt.709 10 bit 420
+ FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020 10 bit 420
+ FOURCC_F210 = FOURCC('F', '2', '1', '0'), // bt.709 full range 10 bit 422
+ FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422
+ FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422
+ FOURCC_P010 = FOURCC('P', '0', '1', '0'),
+ FOURCC_P210 = FOURCC('P', '2', '1', '0'),
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -133,7 +157,7 @@ enum FourCCBpp {
FOURCC_BPP_NV12 = 12,
FOURCC_BPP_YUY2 = 16,
FOURCC_BPP_UYVY = 16,
- FOURCC_BPP_M420 = 12,
+ FOURCC_BPP_M420 = 12, // deprecated
FOURCC_BPP_Q420 = 12,
FOURCC_BPP_ARGB = 32,
FOURCC_BPP_BGRA = 32,
@@ -141,6 +165,8 @@ enum FourCCBpp {
FOURCC_BPP_RGBA = 32,
FOURCC_BPP_AR30 = 32,
FOURCC_BPP_AB30 = 32,
+ FOURCC_BPP_AR64 = 64,
+ FOURCC_BPP_AB64 = 64,
FOURCC_BPP_24BG = 24,
FOURCC_BPP_RAW = 24,
FOURCC_BPP_RGBP = 16,
@@ -158,7 +184,12 @@ enum FourCCBpp {
FOURCC_BPP_J400 = 8,
FOURCC_BPP_H420 = 12,
FOURCC_BPP_H422 = 16,
- FOURCC_BPP_H010 = 24,
+ FOURCC_BPP_I010 = 15,
+ FOURCC_BPP_I210 = 20,
+ FOURCC_BPP_H010 = 15,
+ FOURCC_BPP_H210 = 20,
+ FOURCC_BPP_P010 = 15,
+ FOURCC_BPP_P210 = 20,
FOURCC_BPP_MJPG = 0, // 0 means unknown.
FOURCC_BPP_H264 = 0,
FOURCC_BPP_IYUV = 12,
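The new FOURCC_BPP values for the 10 bit formats appear to count payload bits rather than storage: 4:2:0 carries 1.5 samples per pixel, so 10 bit samples average 15 bpp (I010/H010/P010), while 4:2:2 carries 2 samples per pixel, hence 20 bpp (I210/H210/P210). In memory these formats use 16 bit samples, so the table suits bandwidth estimates rather than allocation. A small sketch of sizing from the table, using constants copied from the diff:

// bpp_sizing_sketch.cc - illustrative buffer sizing from FOURCC_BPP_* values.
#include <cstdint>
#include <cstdio>

// Payload size in bytes for a frame, given an average bits-per-pixel value
// from the FOURCC_BPP_* table (valid when bpp > 0; 0 means unknown).
static uint64_t FramePayloadBytes(int width, int height, int bpp) {
  return static_cast<uint64_t>(width) * height * bpp / 8;
}

int main() {
  // 15 and 20 match FOURCC_BPP_I010 and FOURCC_BPP_I210 above.
  printf("1280x720 I010 payload: %llu bytes\n",
         (unsigned long long)FramePayloadBytes(1280, 720, 15));
  printf("1280x720 I210 payload: %llu bytes\n",
         (unsigned long long)FramePayloadBytes(1280, 720, 20));
  return 0;
}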
diff --git a/infra/config/OWNERS b/infra/config/OWNERS
new file mode 100644
index 00000000..2c4f90a0
--- /dev/null
+++ b/infra/config/OWNERS
@@ -0,0 +1,3 @@
+fbarchard@chromium.org
+mbonadei@chromium.org
+jansson@google.com
diff --git a/infra/config/PRESUBMIT.py b/infra/config/PRESUBMIT.py
new file mode 100644
index 00000000..f79e08ad
--- /dev/null
+++ b/infra/config/PRESUBMIT.py
@@ -0,0 +1,13 @@
+# Copyright 2018 The PDFium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+USE_PYTHON3 = True
+
+
+def CheckChangeOnUpload(input_api, output_api):
+ return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api)
+
+
+def CheckChangeOnCommit(input_api, output_api):
+ return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api)
diff --git a/infra/config/README.md b/infra/config/README.md
new file mode 100644
index 00000000..e5e3b5f8
--- /dev/null
+++ b/infra/config/README.md
@@ -0,0 +1,2 @@
+This folder contains libyuv project-wide configurations
+for chrome-infra services.
diff --git a/files/codereview.settings b/infra/config/codereview.settings
index 00ba1d37..6d742273 100644
--- a/files/codereview.settings
+++ b/infra/config/codereview.settings
@@ -1,6 +1,6 @@
-# This file is used by git cl to get repository specific information.
+# This file is used by gcl and git-cl to get repository-specific information.
CODE_REVIEW_SERVER: codereview.chromium.org
-GERRIT_HOST: True
PROJECT: libyuv
-TRY_ON_UPLOAD: False
+GERRIT_HOST: True
VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/
+
diff --git a/infra/config/commit-queue.cfg b/infra/config/commit-queue.cfg
new file mode 100644
index 00000000..4a8d77f4
--- /dev/null
+++ b/infra/config/commit-queue.cfg
@@ -0,0 +1,143 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see Config message:
+# https://luci-config.appspot.com/schemas/projects:commit-queue.cfg
+
+cq_status_host: "chromium-cq-status.appspot.com"
+submit_options {
+ max_burst: 4
+ burst_delay {
+ seconds: 480
+ }
+}
+config_groups {
+ name: "config"
+ gerrit {
+ url: "https://chromium-review.googlesource.com"
+ projects {
+ name: "libyuv/libyuv"
+ ref_regexp: "refs/heads/infra/config"
+ }
+ }
+ verifiers {
+ gerrit_cq_ability {
+ committer_list: "project-libyuv-committers"
+ dry_run_access_list: "project-libyuv-tryjob-access"
+ }
+ tryjob {
+ builders {
+ name: "libyuv/try/presubmit"
+ }
+ retry_config {
+ single_quota: 1
+ global_quota: 2
+ failure_weight: 1
+ transient_failure_weight: 1
+ timeout_weight: 2
+ }
+ }
+ }
+}
+config_groups {
+ name: "master"
+ gerrit {
+ url: "https://chromium-review.googlesource.com"
+ projects {
+ name: "libyuv/libyuv"
+ ref_regexp: "refs/heads/main"
+ ref_regexp: "refs/heads/master"
+ }
+ }
+ verifiers {
+ gerrit_cq_ability {
+ committer_list: "project-libyuv-committers"
+ dry_run_access_list: "project-libyuv-tryjob-access"
+ }
+ tryjob {
+ builders {
+ name: "libyuv/try/android"
+ experiment_percentage: 100
+ }
+ builders {
+ name: "libyuv/try/android_arm64"
+ experiment_percentage: 100
+ }
+ builders {
+ name: "libyuv/try/android_rel"
+ experiment_percentage: 100
+ }
+ builders {
+ name: "libyuv/try/android_x64"
+ }
+ builders {
+ name: "libyuv/try/android_x86"
+ }
+ builders {
+ name: "libyuv/try/ios_arm64"
+ }
+ builders {
+ name: "libyuv/try/ios_arm64_rel"
+ }
+ builders {
+ name: "libyuv/try/linux"
+ }
+ builders {
+ name: "libyuv/try/linux_asan"
+ }
+ builders {
+ name: "libyuv/try/linux_gcc"
+ experiment_percentage: 100
+ }
+ builders {
+ name: "libyuv/try/linux_msan"
+ }
+ builders {
+ name: "libyuv/try/linux_rel"
+ }
+ builders {
+ name: "libyuv/try/linux_tsan2"
+ }
+ builders {
+ name: "libyuv/try/linux_ubsan"
+ }
+ builders {
+ name: "libyuv/try/linux_ubsan_vptr"
+ }
+ builders {
+ name: "libyuv/try/mac"
+ }
+ builders {
+ name: "libyuv/try/mac_asan"
+ }
+ builders {
+ name: "libyuv/try/mac_rel"
+ }
+ builders {
+ name: "libyuv/try/win"
+ }
+ builders {
+ name: "libyuv/try/win_clang"
+ }
+ builders {
+ name: "libyuv/try/win_clang_rel"
+ }
+ builders {
+ name: "libyuv/try/win_rel"
+ }
+ builders {
+ name: "libyuv/try/win_x64_clang_rel"
+ }
+ builders {
+ name: "libyuv/try/win_x64_rel"
+ }
+ retry_config {
+ single_quota: 1
+ global_quota: 2
+ failure_weight: 1
+ transient_failure_weight: 1
+ timeout_weight: 2
+ }
+ }
+ }
+}
diff --git a/infra/config/cr-buildbucket.cfg b/infra/config/cr-buildbucket.cfg
new file mode 100644
index 00000000..7415851b
--- /dev/null
+++ b/infra/config/cr-buildbucket.cfg
@@ -0,0 +1,1704 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see BuildbucketCfg message:
+# https://luci-config.appspot.com/schemas/projects:buildbucket.cfg
+
+buckets {
+ name: "ci"
+ acls {
+ role: WRITER
+ group: "project-libyuv-admins"
+ }
+ acls {
+ group: "all"
+ }
+ swarming {
+ builders {
+ name: "Android ARM64 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android Tester ARM32 Debug (Nexus 5X)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "device_type:walleye"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android Tester ARM32 Release (Nexus 5X)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "device_type:walleye"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android Tester ARM64 Debug (Nexus 5X)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "device_type:walleye"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android32 x86 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Android64 x64 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux Asan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux MSan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux Tsan v2"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux UBSan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux UBSan vptr"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux32 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux32 Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux64 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Linux64 Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Mac Asan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Mac64 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Mac64 Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win32 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win32 Debug (Clang)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win32 Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win32 Release (Clang)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win64 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win64 Debug (Clang)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win64 Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "Win64 Release (Clang)"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "iOS ARM64 Debug"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "iOS ARM64 Release"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.ci"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-trusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "client.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ }
+}
+buckets {
+ name: "cron"
+ acls {
+ role: WRITER
+ group: "project-libyuv-admins"
+ }
+ acls {
+ group: "all"
+ }
+ swarming {
+ builders {
+ name: "DEPS Autoroller"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Linux"
+ dimensions: "pool:luci.webrtc.cron"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "recipe": "libyuv/roll_deps"'
+ '}'
+ execution_timeout_secs: 7200
+ build_numbers: YES
+ service_account: "libyuv-ci-autoroll-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ }
+}
+buckets {
+ name: "try"
+ acls {
+ role: WRITER
+ group: "project-libyuv-admins"
+ }
+ acls {
+ group: "all"
+ }
+ acls {
+ role: SCHEDULER
+ group: "project-libyuv-tryjob-access"
+ }
+ acls {
+ role: SCHEDULER
+ group: "service-account-cq"
+ }
+ swarming {
+ builders {
+ name: "android"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "device_type:walleye"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "android_arm64"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "device_type:walleye"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "android_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "device_type:walleye"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "android_x64"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "android_x86"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "ios_arm64"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "ios_arm64_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_asan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_gcc"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_msan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_tsan2"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_ubsan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "linux_ubsan_vptr"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "mac"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "mac_asan"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "mac_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Mac-12"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "presubmit"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Ubuntu-18.04"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "run_presubmit",'
+ ' "repo_name": "libyuv",'
+ ' "runhooks": true'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "win"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "win_clang"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "win_clang_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "win_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "win_x64_clang_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ builders {
+ name: "win_x64_rel"
+ swarming_host: "chromium-swarm.appspot.com"
+ swarming_tags: "vpython:native-python-wrapper"
+ dimensions: "cores:8"
+ dimensions: "cpu:x86-64"
+ dimensions: "os:Windows-10"
+ dimensions: "pool:luci.flex.try"
+ exe {
+ cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build"
+ cipd_version: "refs/heads/main"
+ cmd: "luciexe"
+ }
+ properties:
+ '{'
+ ' "$build/reclient": {'
+ ' "instance": "rbe-webrtc-untrusted",'
+ ' "metrics_project": "chromium-reclient-metrics"'
+ ' },'
+ ' "builder_group": "tryserver.libyuv",'
+ ' "recipe": "libyuv/libyuv"'
+ '}'
+ execution_timeout_secs: 10800
+ build_numbers: YES
+ service_account: "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ experiments {
+ key: "luci.recipes.use_python3"
+ value: 100
+ }
+ }
+ }
+}
diff --git a/infra/config/luci-logdog.cfg b/infra/config/luci-logdog.cfg
new file mode 100644
index 00000000..adc75bef
--- /dev/null
+++ b/infra/config/luci-logdog.cfg
@@ -0,0 +1,9 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see ProjectConfig message:
+# https://luci-config.appspot.com/schemas/projects:luci-logdog.cfg
+
+reader_auth_groups: "all"
+writer_auth_groups: "luci-logdog-chromium-writers"
+archive_gs_bucket: "chromium-luci-logdog"
diff --git a/infra/config/luci-milo.cfg b/infra/config/luci-milo.cfg
new file mode 100644
index 00000000..baf786f2
--- /dev/null
+++ b/infra/config/luci-milo.cfg
@@ -0,0 +1,246 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see Project message:
+# https://luci-config.appspot.com/schemas/projects:luci-milo.cfg
+
+consoles {
+ id: "main"
+ name: "libyuv Main Console"
+ repo_url: "https://chromium.googlesource.com/libyuv/libyuv"
+ refs: "regexp:refs/heads/main"
+ manifest_name: "REVISION"
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android ARM64 Debug"
+ category: "Android|Builder"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android Debug"
+ category: "Android|Builder"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android Release"
+ category: "Android|Builder"
+ short_name: "rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android32 x86 Debug"
+ category: "Android|Builder|x86"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android64 x64 Debug"
+ category: "Android|Builder|x64"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android Tester ARM32 Debug (Nexus 5X)"
+ category: "Android|Tester|ARM 32"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android Tester ARM32 Release (Nexus 5X)"
+ category: "Android|Tester|ARM 32"
+ short_name: "rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Android Tester ARM64 Debug (Nexus 5X)"
+ category: "Android|Tester|ARM 64"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux Asan"
+ category: "Linux"
+ short_name: "asan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux MSan"
+ category: "Linux"
+ short_name: "msan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux Tsan v2"
+ category: "Linux"
+ short_name: "tsan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux UBSan"
+ category: "Linux|UBSan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux UBSan vptr"
+ category: "Linux|UBSan"
+ short_name: "vptr"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux32 Debug"
+ category: "Linux|32"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux32 Release"
+ category: "Linux|32"
+ short_name: "rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux64 Debug"
+ category: "Linux|64"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Linux64 Release"
+ category: "Linux|64"
+ short_name: "rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Mac Asan"
+ category: "Mac"
+ short_name: "asan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Mac64 Debug"
+ category: "Mac"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Mac64 Release"
+ category: "Mac"
+ short_name: "rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win32 Debug"
+ category: "Win|32|Debug"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win32 Debug (Clang)"
+ category: "Win|32|Debug"
+ short_name: "clg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win32 Release"
+ category: "Win|32|Release"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win32 Release (Clang)"
+ category: "Win|32|Release"
+ short_name: "clg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win64 Debug"
+ category: "Win|64|Debug"
+ short_name: "clg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win64 Debug (Clang)"
+ category: "Win|64|Debug"
+ short_name: "clg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win64 Release"
+ category: "Win|64|Release"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/Win64 Release (Clang)"
+ category: "Win|64|Release"
+ short_name: "clg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/iOS ARM64 Debug"
+ category: "iOS|ARM64"
+ short_name: "dbg"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.ci/iOS ARM64 Release"
+ category: "iOS|ARM64"
+ short_name: "rel"
+ }
+ include_experimental_builds: true
+}
+consoles {
+ id: "cron"
+ name: "Cron"
+ builders {
+ name: "buildbucket/luci.libyuv.cron/DEPS Autoroller"
+ }
+ builder_view_only: true
+}
+consoles {
+ id: "try"
+ name: "libyuv Try Builders"
+ builders {
+ name: "buildbucket/luci.libyuv.try/android"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/android_arm64"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/android_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/android_x64"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/android_x86"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/ios_arm64"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/ios_arm64_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_asan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_gcc"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_msan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_tsan2"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_ubsan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/linux_ubsan_vptr"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/mac"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/mac_asan"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/mac_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/win"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/win_clang"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/win_clang_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/win_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/win_x64_clang_rel"
+ }
+ builders {
+ name: "buildbucket/luci.libyuv.try/win_x64_rel"
+ }
+ builder_view_only: true
+}
+logo_url: "https://storage.googleapis.com/chrome-infra-public/logo/libyuv-logo.png"
diff --git a/infra/config/luci-scheduler.cfg b/infra/config/luci-scheduler.cfg
new file mode 100644
index 00000000..0ec5dd0e
--- /dev/null
+++ b/infra/config/luci-scheduler.cfg
@@ -0,0 +1,385 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see ProjectConfig message:
+# https://luci-config.appspot.com/schemas/projects:luci-scheduler.cfg
+
+job {
+ id: "Android ARM64 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android ARM64 Debug"
+ }
+}
+job {
+ id: "Android Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android Debug"
+ }
+}
+job {
+ id: "Android Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android Release"
+ }
+}
+job {
+ id: "Android Tester ARM32 Debug (Nexus 5X)"
+ realm: "ci"
+ acls {
+ role: TRIGGERER
+ granted_to: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android Tester ARM32 Debug (Nexus 5X)"
+ }
+}
+job {
+ id: "Android Tester ARM32 Release (Nexus 5X)"
+ realm: "ci"
+ acls {
+ role: TRIGGERER
+ granted_to: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android Tester ARM32 Release (Nexus 5X)"
+ }
+}
+job {
+ id: "Android Tester ARM64 Debug (Nexus 5X)"
+ realm: "ci"
+ acls {
+ role: TRIGGERER
+ granted_to: "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android Tester ARM64 Debug (Nexus 5X)"
+ }
+}
+job {
+ id: "Android32 x86 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android32 x86 Debug"
+ }
+}
+job {
+ id: "Android64 x64 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Android64 x64 Debug"
+ }
+}
+job {
+ id: "DEPS Autoroller"
+ realm: "cron"
+ schedule: "0 14 * * *"
+ acl_sets: "cron"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "cron"
+ builder: "DEPS Autoroller"
+ }
+}
+job {
+ id: "Linux Asan"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux Asan"
+ }
+}
+job {
+ id: "Linux MSan"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux MSan"
+ }
+}
+job {
+ id: "Linux Tsan v2"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux Tsan v2"
+ }
+}
+job {
+ id: "Linux UBSan"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux UBSan"
+ }
+}
+job {
+ id: "Linux UBSan vptr"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux UBSan vptr"
+ }
+}
+job {
+ id: "Linux32 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux32 Debug"
+ }
+}
+job {
+ id: "Linux32 Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux32 Release"
+ }
+}
+job {
+ id: "Linux64 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux64 Debug"
+ }
+}
+job {
+ id: "Linux64 Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Linux64 Release"
+ }
+}
+job {
+ id: "Mac Asan"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Mac Asan"
+ }
+}
+job {
+ id: "Mac64 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Mac64 Debug"
+ }
+}
+job {
+ id: "Mac64 Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Mac64 Release"
+ }
+}
+job {
+ id: "Win32 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win32 Debug"
+ }
+}
+job {
+ id: "Win32 Debug (Clang)"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win32 Debug (Clang)"
+ }
+}
+job {
+ id: "Win32 Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win32 Release"
+ }
+}
+job {
+ id: "Win32 Release (Clang)"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win32 Release (Clang)"
+ }
+}
+job {
+ id: "Win64 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win64 Debug"
+ }
+}
+job {
+ id: "Win64 Debug (Clang)"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win64 Debug (Clang)"
+ }
+}
+job {
+ id: "Win64 Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win64 Release"
+ }
+}
+job {
+ id: "Win64 Release (Clang)"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "Win64 Release (Clang)"
+ }
+}
+job {
+ id: "iOS ARM64 Debug"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "iOS ARM64 Debug"
+ }
+}
+job {
+ id: "iOS ARM64 Release"
+ realm: "ci"
+ acl_sets: "ci"
+ buildbucket {
+ server: "cr-buildbucket.appspot.com"
+ bucket: "ci"
+ builder: "iOS ARM64 Release"
+ }
+}
+trigger {
+ id: "master-gitiles-trigger"
+ realm: "ci"
+ acl_sets: "ci"
+ triggers: "Android ARM64 Debug"
+ triggers: "Android Debug"
+ triggers: "Android Release"
+ triggers: "Android32 x86 Debug"
+ triggers: "Android64 x64 Debug"
+ triggers: "Linux Asan"
+ triggers: "Linux MSan"
+ triggers: "Linux Tsan v2"
+ triggers: "Linux UBSan"
+ triggers: "Linux UBSan vptr"
+ triggers: "Linux32 Debug"
+ triggers: "Linux32 Release"
+ triggers: "Linux64 Debug"
+ triggers: "Linux64 Release"
+ triggers: "Mac Asan"
+ triggers: "Mac64 Debug"
+ triggers: "Mac64 Release"
+ triggers: "Win32 Debug"
+ triggers: "Win32 Debug (Clang)"
+ triggers: "Win32 Release"
+ triggers: "Win32 Release (Clang)"
+ triggers: "Win64 Debug"
+ triggers: "Win64 Debug (Clang)"
+ triggers: "Win64 Release"
+ triggers: "Win64 Release (Clang)"
+ triggers: "iOS ARM64 Debug"
+ triggers: "iOS ARM64 Release"
+ gitiles {
+ repo: "https://chromium.googlesource.com/libyuv/libyuv"
+ refs: "regexp:refs/heads/main"
+ }
+}
+acl_sets {
+ name: "ci"
+ acls {
+ role: OWNER
+ granted_to: "group:project-libyuv-admins"
+ }
+ acls {
+ granted_to: "group:all"
+ }
+}
+acl_sets {
+ name: "cron"
+ acls {
+ role: OWNER
+ granted_to: "group:project-libyuv-admins"
+ }
+ acls {
+ granted_to: "group:all"
+ }
+}
diff --git a/infra/config/main.star b/infra/config/main.star
new file mode 100755
index 00000000..e83afe4f
--- /dev/null
+++ b/infra/config/main.star
@@ -0,0 +1,344 @@
+#!/usr/bin/env lucicfg
+# https://chromium.googlesource.com/infra/luci/luci-go/+/master/lucicfg/doc/
+
+"""LUCI project configuration for libyuv CQ and CI."""
+
+lucicfg.check_version("1.30.9")
+
+LIBYUV_GIT = "https://chromium.googlesource.com/libyuv/libyuv"
+LIBYUV_GERRIT = "https://chromium-review.googlesource.com/libyuv/libyuv"
+
+RECLIENT_CI = {
+ "instance": "rbe-webrtc-trusted",
+ "metrics_project": "chromium-reclient-metrics",
+}
+
+RECLIENT_CQ = {
+ "instance": "rbe-webrtc-untrusted",
+ "metrics_project": "chromium-reclient-metrics",
+}
+
+# Use LUCI Scheduler BBv2 names and add Scheduler realms configs.
+lucicfg.enable_experiment("crbug.com/1182002")
+
+luci.builder.defaults.experiments.set(
+ {
+ "luci.recipes.use_python3": 100,
+ },
+)
+
+lucicfg.config(
+ lint_checks = ["default"],
+ config_dir = ".",
+ tracked_files = [
+ "commit-queue.cfg",
+ "cr-buildbucket.cfg",
+ "luci-logdog.cfg",
+ "luci-milo.cfg",
+ "luci-scheduler.cfg",
+ "project.cfg",
+ "realms.cfg",
+ ],
+)
+
+# Generates project.cfg
+
+luci.project(
+ name = "libyuv",
+ buildbucket = "cr-buildbucket.appspot.com",
+ logdog = "luci-logdog.appspot.com",
+ milo = "luci-milo.appspot.com",
+ notify = "luci-notify.appspot.com",
+ scheduler = "luci-scheduler.appspot.com",
+ swarming = "chromium-swarm.appspot.com",
+ acls = [
+ acl.entry(acl.PROJECT_CONFIGS_READER, groups = ["all"]),
+ acl.entry(acl.LOGDOG_READER, groups = ["all"]),
+ acl.entry(acl.LOGDOG_WRITER, groups = ["luci-logdog-chromium-writers"]),
+ acl.entry(acl.SCHEDULER_READER, groups = ["all"]),
+ acl.entry(acl.SCHEDULER_OWNER, groups = ["project-libyuv-admins"]),
+ acl.entry(acl.BUILDBUCKET_READER, groups = ["all"]),
+ acl.entry(acl.BUILDBUCKET_OWNER, groups = ["project-libyuv-admins"]),
+ ],
+ bindings = [
+ luci.binding(
+ roles = "role/swarming.taskTriggerer", # for LED tasks.
+ groups = "project-libyuv-admins",
+ ),
+ luci.binding(
+ roles = "role/configs.validator",
+ users = "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com",
+ ),
+ ],
+)
+
+# Generates luci-logdog.cfg
+
+luci.logdog(
+ gs_bucket = "chromium-luci-logdog",
+)
+
+# Generates luci-scheduler.cfg
+
+luci.gitiles_poller(
+ name = "master-gitiles-trigger",
+ bucket = "ci",
+ repo = LIBYUV_GIT,
+)
+
+# Generates luci-milo.cfg
+
+luci.milo(
+ logo = "https://storage.googleapis.com/chrome-infra-public/logo/libyuv-logo.png",
+)
+
+def libyuv_ci_view(name, category, short_name):
+ return luci.console_view_entry(
+ console_view = "main",
+ builder = name,
+ category = category,
+ short_name = short_name,
+ )
+
+def libyuv_try_view(name):
+ return luci.list_view_entry(
+ list_view = "try",
+ builder = name,
+ )
+
+luci.console_view(
+ name = "main",
+ title = "libyuv Main Console",
+ include_experimental_builds = True,
+ repo = LIBYUV_GIT,
+)
+
+luci.list_view(
+ name = "cron",
+ title = "Cron",
+ entries = ["DEPS Autoroller"],
+)
+
+luci.list_view(
+ name = "try",
+ title = "libyuv Try Builders",
+)
+
+# Generates commit-queue.cfg
+
+def libyuv_try_job_verifier(name, cq_group, experiment_percentage):
+ return luci.cq_tryjob_verifier(
+ builder = name,
+ cq_group = cq_group,
+ experiment_percentage = experiment_percentage,
+ )
+
+luci.cq(
+ status_host = "chromium-cq-status.appspot.com",
+ submit_max_burst = 4,
+ submit_burst_delay = 8 * time.minute,
+)
+
+luci.cq_group(
+ name = "master",
+ watch = [
+ cq.refset(
+ repo = LIBYUV_GERRIT,
+ refs = ["refs/heads/main", "refs/heads/master"],
+ ),
+ ],
+ acls = [
+ acl.entry(acl.CQ_COMMITTER, groups = ["project-libyuv-committers"]),
+ acl.entry(acl.CQ_DRY_RUNNER, groups = ["project-libyuv-tryjob-access"]),
+ ],
+ retry_config = cq.RETRY_ALL_FAILURES,
+ cancel_stale_tryjobs = True,
+)
+
+luci.cq_group(
+ name = "config",
+ watch = [
+ cq.refset(
+ repo = LIBYUV_GERRIT,
+ refs = ["refs/heads/infra/config"],
+ ),
+ ],
+ acls = [
+ acl.entry(acl.CQ_COMMITTER, groups = ["project-libyuv-committers"]),
+ acl.entry(acl.CQ_DRY_RUNNER, groups = ["project-libyuv-tryjob-access"]),
+ ],
+ retry_config = cq.RETRY_ALL_FAILURES,
+ cancel_stale_tryjobs = True,
+)
+
+# Generates cr-buildbucket.cfg
+
+luci.bucket(
+ name = "ci",
+)
+luci.bucket(
+ name = "try",
+ acls = [
+ acl.entry(acl.BUILDBUCKET_TRIGGERER, groups = [
+ "project-libyuv-tryjob-access",
+ "service-account-cq",
+ ]),
+ ],
+)
+luci.bucket(
+ name = "cron",
+)
+
+def get_os_dimensions(os):
+ if os == "android":
+ return {"device_type": "walleye"}
+ if os == "ios" or os == "mac":
+ return {"os": "Mac-12", "cpu": "x86-64"}
+ elif os == "win":
+ return {"os": "Windows-10", "cores": "8", "cpu": "x86-64"}
+ elif os == "linux":
+ return {"os": "Ubuntu-18.04", "cores": "8", "cpu": "x86-64"}
+ return {}
+
+def libyuv_ci_builder(name, dimensions, properties, triggered_by):
+ return luci.builder(
+ name = name,
+ dimensions = dimensions,
+ properties = properties,
+ bucket = "ci",
+ service_account = "libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com",
+ triggered_by = triggered_by,
+ swarming_tags = ["vpython:native-python-wrapper"],
+ execution_timeout = 180 * time.minute,
+ build_numbers = True,
+ executable = luci.recipe(
+ name = "libyuv/libyuv",
+ cipd_package = "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build",
+ use_python3 = True,
+ ),
+ )
+
+def libyuv_try_builder(name, dimensions, properties, recipe_name = "libyuv/libyuv"):
+ return luci.builder(
+ name = name,
+ dimensions = dimensions,
+ properties = properties,
+ bucket = "try",
+ service_account = "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com",
+ swarming_tags = ["vpython:native-python-wrapper"],
+ execution_timeout = 180 * time.minute,
+ build_numbers = True,
+ executable = luci.recipe(
+ name = recipe_name,
+ cipd_package = "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build",
+ use_python3 = True,
+ ),
+ )
+
+def ci_builder(name, os, category, short_name = None):
+ dimensions = get_os_dimensions(os)
+ properties = {"$build/reclient": RECLIENT_CI}
+
+ dimensions["pool"] = "luci.flex.ci"
+ properties["builder_group"] = "client.libyuv"
+
+ triggered_by = ["master-gitiles-trigger" if os != "android" else "Android Debug"]
+ libyuv_ci_view(name, category, short_name)
+ return libyuv_ci_builder(name, dimensions, properties, triggered_by)
+
+def try_builder(name, os, experiment_percentage = None):
+ dimensions = get_os_dimensions(os)
+ properties = {"$build/reclient": RECLIENT_CQ}
+
+ dimensions["pool"] = "luci.flex.try"
+ properties["builder_group"] = "tryserver.libyuv"
+
+ if name == "presubmit":
+ recipe_name = "run_presubmit"
+ properties["repo_name"] = "libyuv"
+ properties["runhooks"] = True
+ libyuv_try_job_verifier(name, "config", experiment_percentage)
+ return libyuv_try_builder(name, dimensions, properties, recipe_name)
+
+ libyuv_try_job_verifier(name, "master", experiment_percentage)
+ libyuv_try_view(name)
+ return libyuv_try_builder(name, dimensions, properties)
+
+luci.builder(
+ name = "DEPS Autoroller",
+ bucket = "cron",
+ service_account = "libyuv-ci-autoroll-builder@chops-service-accounts.iam.gserviceaccount.com",
+ dimensions = {
+ "pool": "luci.webrtc.cron",
+ "os": "Linux",
+ "cpu": "x86-64",
+ },
+ swarming_tags = ["vpython:native-python-wrapper"],
+ execution_timeout = 120 * time.minute,
+ build_numbers = True,
+ schedule = "0 14 * * *", # Every 2 hours.
+ executable = luci.recipe(
+ name = "libyuv/roll_deps",
+ cipd_package = "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build",
+ use_python3 = True,
+ ),
+)
+
+ci_builder("Android ARM64 Debug", "linux", "Android|Builder", "dbg")
+ci_builder("Android Debug", "linux", "Android|Builder", "dbg")
+ci_builder("Android Release", "linux", "Android|Builder", "rel")
+ci_builder("Android32 x86 Debug", "linux", "Android|Builder|x86", "dbg")
+ci_builder("Android64 x64 Debug", "linux", "Android|Builder|x64", "dbg")
+ci_builder("Android Tester ARM32 Debug (Nexus 5X)", "android", "Android|Tester|ARM 32", "dbg")
+ci_builder("Android Tester ARM32 Release (Nexus 5X)", "android", "Android|Tester|ARM 32", "rel")
+ci_builder("Android Tester ARM64 Debug (Nexus 5X)", "android", "Android|Tester|ARM 64", "dbg")
+ci_builder("Linux Asan", "linux", "Linux", "asan")
+ci_builder("Linux MSan", "linux", "Linux", "msan")
+ci_builder("Linux Tsan v2", "linux", "Linux", "tsan")
+ci_builder("Linux UBSan", "linux", "Linux|UBSan")
+ci_builder("Linux UBSan vptr", "linux", "Linux|UBSan", "vptr")
+ci_builder("Linux32 Debug", "linux", "Linux|32", "dbg")
+ci_builder("Linux32 Release", "linux", "Linux|32", "rel")
+ci_builder("Linux64 Debug", "linux", "Linux|64", "dbg")
+ci_builder("Linux64 Release", "linux", "Linux|64", "rel")
+ci_builder("Mac Asan", "mac", "Mac", "asan")
+ci_builder("Mac64 Debug", "mac", "Mac", "dbg")
+ci_builder("Mac64 Release", "mac", "Mac", "rel")
+ci_builder("Win32 Debug", "win", "Win|32|Debug")
+ci_builder("Win32 Debug (Clang)", "win", "Win|32|Debug", "clg")
+ci_builder("Win32 Release", "win", "Win|32|Release")
+ci_builder("Win32 Release (Clang)", "win", "Win|32|Release", "clg")
+ci_builder("Win64 Debug", "win", "Win|64|Debug", "clg")
+ci_builder("Win64 Debug (Clang)", "win", "Win|64|Debug", "clg")
+ci_builder("Win64 Release", "win", "Win|64|Release")
+ci_builder("Win64 Release (Clang)", "win", "Win|64|Release", "clg")
+ci_builder("iOS ARM64 Debug", "ios", "iOS|ARM64", "dbg")
+ci_builder("iOS ARM64 Release", "ios", "iOS|ARM64", "rel")
+
+# TODO(crbug.com/1242847): make this not experimental.
+try_builder("android", "android", experiment_percentage = 100)
+try_builder("android_arm64", "android", experiment_percentage = 100)
+try_builder("android_rel", "android", experiment_percentage = 100)
+
+try_builder("android_x64", "linux")
+try_builder("android_x86", "linux")
+try_builder("ios_arm64", "ios")
+try_builder("ios_arm64_rel", "ios")
+try_builder("linux", "linux")
+try_builder("linux_asan", "linux")
+try_builder("linux_gcc", "linux", experiment_percentage = 100)
+try_builder("linux_msan", "linux")
+try_builder("linux_rel", "linux")
+try_builder("linux_tsan2", "linux")
+try_builder("linux_ubsan", "linux")
+try_builder("linux_ubsan_vptr", "linux")
+try_builder("mac", "mac")
+try_builder("mac_asan", "mac")
+try_builder("mac_rel", "mac")
+try_builder("win", "win")
+try_builder("win_clang", "win")
+try_builder("win_clang_rel", "win")
+try_builder("win_rel", "win")
+try_builder("win_x64_clang_rel", "win")
+try_builder("win_x64_rel", "win")
+try_builder("presubmit", "linux")
diff --git a/infra/config/project.cfg b/infra/config/project.cfg
new file mode 100644
index 00000000..3c327118
--- /dev/null
+++ b/infra/config/project.cfg
@@ -0,0 +1,15 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see ProjectCfg message:
+# https://luci-config.appspot.com/schemas/projects:project.cfg
+
+name: "libyuv"
+access: "group:all"
+lucicfg {
+ version: "1.39.14"
+ package_dir: "."
+ config_dir: "."
+ entry_point: "main.star"
+ experiments: "crbug.com/1182002"
+}
diff --git a/infra/config/realms.cfg b/infra/config/realms.cfg
new file mode 100644
index 00000000..16ffaac9
--- /dev/null
+++ b/infra/config/realms.cfg
@@ -0,0 +1,83 @@
+# Auto-generated by lucicfg.
+# Do not modify manually.
+#
+# For the schema of this file, see RealmsCfg message:
+# https://luci-config.appspot.com/schemas/projects:realms.cfg
+
+realms {
+ name: "@root"
+ bindings {
+ role: "role/buildbucket.owner"
+ principals: "group:project-libyuv-admins"
+ }
+ bindings {
+ role: "role/buildbucket.reader"
+ principals: "group:all"
+ }
+ bindings {
+ role: "role/configs.reader"
+ principals: "group:all"
+ }
+ bindings {
+ role: "role/configs.validator"
+ principals: "user:libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+ bindings {
+ role: "role/logdog.reader"
+ principals: "group:all"
+ }
+ bindings {
+ role: "role/logdog.writer"
+ principals: "group:luci-logdog-chromium-writers"
+ }
+ bindings {
+ role: "role/scheduler.owner"
+ principals: "group:project-libyuv-admins"
+ }
+ bindings {
+ role: "role/scheduler.reader"
+ principals: "group:all"
+ }
+ bindings {
+ role: "role/swarming.taskTriggerer"
+ principals: "group:project-libyuv-admins"
+ }
+}
+realms {
+ name: "ci"
+ bindings {
+ role: "role/buildbucket.builderServiceAccount"
+ principals: "user:libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+ bindings {
+ role: "role/scheduler.triggerer"
+ principals: "user:libyuv-ci-builder@chops-service-accounts.iam.gserviceaccount.com"
+ conditions {
+ restrict {
+ attribute: "scheduler.job.name"
+ values: "Android Tester ARM32 Debug (Nexus 5X)"
+ values: "Android Tester ARM32 Release (Nexus 5X)"
+ values: "Android Tester ARM64 Debug (Nexus 5X)"
+ }
+ }
+ }
+}
+realms {
+ name: "cron"
+ bindings {
+ role: "role/buildbucket.builderServiceAccount"
+ principals: "user:libyuv-ci-autoroll-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+}
+realms {
+ name: "try"
+ bindings {
+ role: "role/buildbucket.builderServiceAccount"
+ principals: "user:libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com"
+ }
+ bindings {
+ role: "role/buildbucket.triggerer"
+ principals: "group:project-libyuv-tryjob-access"
+ principals: "group:service-account-cq"
+ }
+}
diff --git a/files/libyuv.gni b/libyuv.gni
index 8df40ba2..343160c3 100644
--- a/files/libyuv.gni
+++ b/libyuv.gni
@@ -6,13 +6,15 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-import("//build_overrides/build.gni")
import("//build/config/arm.gni")
+import("//build/config/loongarch64.gni")
import("//build/config/mips.gni")
+import("//build_overrides/build.gni")
declare_args() {
libyuv_include_tests = !build_with_chromium
libyuv_disable_jpeg = false
+ libyuv_disable_rvv = false
libyuv_use_neon =
current_cpu == "arm64" ||
(current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon))
@@ -20,4 +22,8 @@ declare_args() {
(current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_msa
libyuv_use_mmi =
(current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_mmi
+ libyuv_use_lsx =
+ (current_cpu == "loong64") && loongarch64_use_lsx
+ libyuv_use_lasx =
+ (current_cpu == "loong64") && loongarch64_use_lasx
}
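For illustration only, a hedged GN invocation exercising the new arguments (the output directories and the target_cpu value are assumptions, not part of this change):

    gn gen out/loong64 --args='target_cpu="loong64" libyuv_use_lsx=true libyuv_use_lasx=true'
    gn gen out/release --args='libyuv_disable_rvv=true'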
diff --git a/files/libyuv.gyp b/libyuv.gyp
index f73a1a4b..f73a1a4b 100644
--- a/files/libyuv.gyp
+++ b/libyuv.gyp
diff --git a/files/libyuv.gypi b/libyuv.gypi
index 18b2feca..48936aa7 100644
--- a/files/libyuv.gypi
+++ b/libyuv.gypi
@@ -27,7 +27,9 @@
'include/libyuv/row.h',
'include/libyuv/scale.h',
'include/libyuv/scale_argb.h',
+ 'include/libyuv/scale_rgb.h',
'include/libyuv/scale_row.h',
+ 'include/libyuv/scale_uv.h',
'include/libyuv/version.h',
'include/libyuv/video_common.h',
@@ -35,6 +37,7 @@
'source/compare.cc',
'source/compare_common.cc',
'source/compare_gcc.cc',
+ 'source/compare_msa.cc',
'source/compare_neon.cc',
'source/compare_neon64.cc',
'source/compare_win.cc',
@@ -54,7 +57,6 @@
'source/rotate_argb.cc',
'source/rotate_common.cc',
'source/rotate_gcc.cc',
- 'source/rotate_dspr2.cc',
'source/rotate_msa.cc',
'source/rotate_neon.cc',
'source/rotate_neon64.cc',
@@ -62,7 +64,6 @@
'source/row_any.cc',
'source/row_common.cc',
'source/row_gcc.cc',
- 'source/row_dspr2.cc',
'source/row_msa.cc',
'source/row_neon.cc',
'source/row_neon64.cc',
@@ -72,10 +73,11 @@
'source/scale_argb.cc',
'source/scale_common.cc',
'source/scale_gcc.cc',
- 'source/scale_dspr2.cc',
'source/scale_msa.cc',
'source/scale_neon.cc',
'source/scale_neon64.cc',
+ 'source/scale_rgb.cc',
+ 'source/scale_uv.cc',
'source/scale_win.cc',
'source/video_common.cc',
],
diff --git a/files/linux.mk b/linux.mk
index e9a26a79..d19a888a 100644
--- a/files/linux.mk
+++ b/linux.mk
@@ -13,15 +13,14 @@ LOCAL_OBJ_FILES := \
source/compare.o \
source/compare_common.o \
source/compare_gcc.o \
- source/compare_mmi.o \
source/compare_msa.o \
- source/compare_neon64.o \
source/compare_neon.o \
+ source/compare_neon64.o \
source/compare_win.o \
- source/convert_argb.o \
source/convert.o \
- source/convert_from_argb.o \
+ source/convert_argb.o \
source/convert_from.o \
+ source/convert_from_argb.o \
source/convert_jpeg.o \
source/convert_to_argb.o \
source/convert_to_i420.o \
@@ -29,33 +28,38 @@ LOCAL_OBJ_FILES := \
source/mjpeg_decoder.o \
source/mjpeg_validate.o \
source/planar_functions.o \
+ source/rotate.o \
source/rotate_any.o \
source/rotate_argb.o \
- source/rotate.o \
source/rotate_common.o \
source/rotate_gcc.o \
- source/rotate_mmi.o \
+ source/rotate_lsx.o \
source/rotate_msa.o \
- source/rotate_neon64.o \
source/rotate_neon.o \
+ source/rotate_neon64.o \
source/rotate_win.o \
source/row_any.o \
source/row_common.o \
source/row_gcc.o \
- source/row_mmi.o \
+ source/row_lasx.o \
+ source/row_lsx.o \
source/row_msa.o \
- source/row_neon64.o \
source/row_neon.o \
+ source/row_neon64.o \
+ source/row_rvv.o \
source/row_win.o \
+ source/scale.o \
source/scale_any.o \
source/scale_argb.o \
- source/scale.o \
source/scale_common.o \
source/scale_gcc.o \
- source/scale_mmi.o \
+ source/scale_lsx.o \
source/scale_msa.o \
- source/scale_neon64.o \
source/scale_neon.o \
+ source/scale_neon64.o \
+ source/scale_rgb.o \
+ source/scale_rvv.o \
+ source/scale_uv.o \
source/scale_win.o \
source/video_common.o
@@ -65,7 +69,7 @@ LOCAL_OBJ_FILES := \
.c.o:
$(CC) -c $(CFLAGS) $*.c -o $*.o
-all: libyuv.a yuvconvert cpuid psnr
+all: libyuv.a i444tonv12_eg yuvconvert yuvconstants cpuid psnr
libyuv.a: $(LOCAL_OBJ_FILES)
$(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)
@@ -74,10 +78,18 @@ libyuv.a: $(LOCAL_OBJ_FILES)
yuvconvert: util/yuvconvert.cc libyuv.a
$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/yuvconvert.cc libyuv.a
+# A C test utility that generates the yuvconstants tables used for YUV-to-RGB conversion.
+yuvconstants: util/yuvconstants.c libyuv.a
+ $(CXX) $(CXXFLAGS) -Iutil/ -lm -o $@ util/yuvconstants.c libyuv.a
+
# A standalone test utility
psnr: util/psnr.cc
$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc
+# A simple conversion example.
+i444tonv12_eg: util/i444tonv12_eg.cc libyuv.a
+ $(CXX) $(CXXFLAGS) -o $@ util/i444tonv12_eg.cc libyuv.a
+
# A C test utility that uses libyuv conversion from C.
# gcc 4.4 and older require -fno-exceptions to avoid link error on __gxx_personality_v0
# CC=gcc-4.4 CXXFLAGS=-fno-exceptions CXX=g++-4.4 make -f linux.mk
@@ -85,4 +97,4 @@ cpuid: util/cpuid.c libyuv.a
$(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a
clean:
- /bin/rm -f source/*.o *.ii *.s libyuv.a yuvconvert cpuid psnr
+ /bin/rm -f source/*.o *.ii *.s libyuv.a i444tonv12_eg yuvconvert yuvconstants cpuid psnr
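With the new utilities wired into the all target, a plain build from the source root is (GNU make assumed):

    make -f linux.mk         # builds libyuv.a, yuvconvert, yuvconstants, i444tonv12_eg, cpuid, psnr
    make -f linux.mk clean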
diff --git a/public.mk b/public.mk
index 259ece21..1342307a 100644
--- a/public.mk
+++ b/public.mk
@@ -3,7 +3,7 @@
# Note that dependencies on NDK are not directly listed since NDK auto adds
# them.
-LIBYUV_INCLUDES := $(LIBYUV_PATH)/files/include
+LIBYUV_INCLUDES := $(LIBYUV_PATH)/include
LIBYUV_C_FLAGS :=
diff --git a/files/pylintrc b/pylintrc
index b8bea334..b8bea334 100644
--- a/files/pylintrc
+++ b/pylintrc
diff --git a/riscv_script/prepare_toolchain_qemu.sh b/riscv_script/prepare_toolchain_qemu.sh
new file mode 100755
index 00000000..2a901739
--- /dev/null
+++ b/riscv_script/prepare_toolchain_qemu.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+set -ev
+
+# Download & build RISC-V Clang toolchain & QEMU emulator.
+# RISC-V Clang is for cross compile with the RISC-V Vector ISA.
+# RISC-V QEMU is used to run the test suite.
+#
+# Requirements: Linux host w/ working C++ compiler, git, cmake, ninja, wget, tar
+
+# NOTE: this script must be run from the top-level libyuv source directory.
+
+RISCV_TRIPLE="riscv64-unknown-linux-gnu"
+RISCV_QEMU="qemu-riscv64"
+
+LIBYUV_SRC_DIR=$(pwd)
+BUILD_DIR="$LIBYUV_SRC_DIR"/build-toolchain-qemu
+INSTALL_QEMU="$BUILD_DIR"/riscv-qemu
+INSTALL_CLANG="$BUILD_DIR"/riscv-clang
+
+LLVM_VERSION="16.0.0"
+LLVM_NAME=llvm-project-"$LLVM_VERSION".src
+
+RISCV_GNU_TOOLCHAIN="$BUILD_DIR"/riscv-gnu-toolchain
+RISCV_CLANG_TOOLCHAIN="$BUILD_DIR"/"$LLVM_NAME"
+
+QEMU_NAME="qemu-7.0.0"
+
+mkdir -p "$BUILD_DIR"
+cd "$BUILD_DIR"
+
+# Download and install RISC-V GNU Toolchain (needed to build Clang)
+if [ ! -d "$RISCV_GNU_TOOLCHAIN" ]
+then
+ git clone https://github.com/riscv/riscv-gnu-toolchain.git
+ pushd "$RISCV_GNU_TOOLCHAIN"
+ git submodule update --init --recursive
+ ./configure --with-cmodel=medany --prefix="$INSTALL_CLANG"
+ ionice nice make linux -j `nproc` install
+ popd
+fi
+
+# Download Clang toolchain & build cross compiler
+if [ ! -d "$RISCV_CLANG_TOOLCHAIN" ]
+then
+ wget https://github.com/llvm/llvm-project/releases/download/llvmorg-"$LLVM_VERSION"/"$LLVM_NAME".tar.xz
+ tar xvJf "$LLVM_NAME".tar.xz
+ pushd "$RISCV_CLANG_TOOLCHAIN"
+ cmake -DCMAKE_INSTALL_PREFIX="$INSTALL_CLANG" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DLLVM_TARGETS_TO_BUILD="RISCV" \
+ -DLLVM_ENABLE_PROJECTS="clang" \
+ -DLLVM_DEFAULT_TARGET_TRIPLE="$RISCV_TRIPLE" \
+ -DLLVM_INSTALL_TOOLCHAIN_ONLY=On \
+ -DDEFAULT_SYSROOT=../sysroot \
+ -G "Ninja" "$RISCV_CLANG_TOOLCHAIN"/llvm
+ ionice nice ninja -j `nproc`
+ ionice nice ninja -j `nproc` install
+ popd
+ pushd "$INSTALL_CLANG"/bin
+ ln -sf clang "$RISCV_TRIPLE"-clang
+ ln -sf clang++ "$RISCV_TRIPLE"-clang++
+ popd
+fi
+
+# Download QEMU and build the riscv64 Linux usermode emulator
+if [ ! -d "$QEMU_NAME" ]
+then
+ wget https://download.qemu.org/"$QEMU_NAME".tar.xz
+ tar xvJf "$QEMU_NAME".tar.xz
+ pushd "$QEMU_NAME"
+ ./configure --target-list=riscv64-linux-user --prefix="$INSTALL_QEMU"
+ ionice nice make -j `nproc` install
+ popd
+fi
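Usage sketch, following the paths the script hardcodes (it builds the GNU toolchain, Clang, and QEMU from source, so expect a long build):

    cd <libyuv source root>
    ./riscv_script/prepare_toolchain_qemu.sh
    # cross toolchain:  build-toolchain-qemu/riscv-clang
    # qemu-riscv64:     build-toolchain-qemu/riscv-qemu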
diff --git a/riscv_script/riscv-clang.cmake b/riscv_script/riscv-clang.cmake
new file mode 100644
index 00000000..e287941f
--- /dev/null
+++ b/riscv_script/riscv-clang.cmake
@@ -0,0 +1,55 @@
+set(CMAKE_CROSSCOMPILING TRUE)
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_SYSTEM_PROCESSOR "riscv64")
+
+option(USE_RVV "Enable riscv vector or not." ON)
+option(USE_AUTO_VECTORIZER "Enable riscv auto vectorizer or not." OFF)
+
+# Avoid using host system paths when cross-compiling.
+set(CMAKE_FIND_USE_CMAKE_SYSTEM_PATH FALSE)
+
+set(TOOLCHAIN_PATH "" CACHE STRING "The toolcahin path.")
+if(NOT TOOLCHAIN_PATH)
+ set(TOOLCHAIN_PATH ${CMAKE_SOURCE_DIR}/build-toolchain-qemu/riscv-clang)
+endif()
+
+set(TOOLCHAIN_PREFIX "riscv64-unknown-linux-gnu-" CACHE STRING "The toolcahin prefix.")
+
+# toolchain setting
+set(CMAKE_C_COMPILER "${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}clang")
+set(CMAKE_CXX_COMPILER "${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}clang++")
+
+# CMake would otherwise pick host-side versions of the following tools, so set them explicitly here.
+set(CMAKE_C_COMPILER_AR "${TOOLCHAIN_PATH}/bin/llvm-ar")
+set(CMAKE_CXX_COMPILER_AR "${TOOLCHAIN_PATH}/bin/llvm-ar")
+set(CMAKE_C_COMPILER_RANLIB "${TOOLCHAIN_PATH}/bin/llvm-ranlib")
+set(CMAKE_CXX_COMPILER_RANLIB "${TOOLCHAIN_PATH}/bin/llvm-ranlib")
+set(CMAKE_OBJDUMP "${TOOLCHAIN_PATH}/bin/llvm-objdump")
+set(CMAKE_OBJCOPY "${TOOLCHAIN_PATH}/bin/llvm-objcopy")
+
+# compile options
+set(RISCV_COMPILER_FLAGS "" CACHE STRING "Compile flags")
+# If the user provides RISCV_COMPILER_FLAGS, no extra compile flags are appended.
+if(RISCV_COMPILER_FLAGS STREQUAL "")
+ message(STATUS "USE_RVV: ${USE_RVV}")
+ message(STATUS "USE_AUTO_VECTORIZER: ${USE_AUTO_VECTORIZER}")
+ if(USE_RVV)
+ list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gcv")
+ if(NOT USE_AUTO_VECTORIZER)
+ # Disable auto-vectorizer
+ add_compile_options(-fno-vectorize -fno-slp-vectorize)
+ endif()
+ else()
+ list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gc")
+ endif()
+endif()
+message(STATUS "RISCV_COMPILER_FLAGS: ${RISCV_COMPILER_FLAGS}")
+
+set(CMAKE_C_FLAGS "${RISCV_COMPILER_FLAGS} ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "${RISCV_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}")
+
+set(RISCV_LINKER_FLAGS "-lstdc++ -lpthread -lm -ldl")
+set(RISCV_LINKER_FLAGS_EXE)
+set(CMAKE_SHARED_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
+set(CMAKE_MODULE_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}")
+set(CMAKE_EXE_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${RISCV_LINKER_FLAGS_EXE} ${CMAKE_EXE_LINKER_FLAGS}")
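A hedged cross-compile invocation using this toolchain file (the build directory name is illustrative; cmake -B requires CMake 3.13+):

    cmake -B out/riscv -DCMAKE_TOOLCHAIN_FILE=riscv_script/riscv-clang.cmake -DUSE_RVV=ON .
    cmake --build out/riscv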
diff --git a/riscv_script/run_qemu.sh b/riscv_script/run_qemu.sh
new file mode 100755
index 00000000..080af3b1
--- /dev/null
+++ b/riscv_script/run_qemu.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -x
+set -e
+
+USE_RVV="${USE_RVV:-OFF}"
+TOOLCHAIN_PATH="${TOOLCHAIN_PATH:-../../build-toolchain-qemu/riscv-clang}"
+QEMU_PREFIX_PATH="${QEMU_PREFIX_PATH:-../../build-toolchain-qemu/riscv-qemu/}"
+
+if [ "${USE_RVV}" = "ON" ];then
+ QEMU_OPTION="-cpu rv64,zba=true,zbb=true,zbc=true,zbs=true,v=true,vlen=512,elen=64,vext_spec=v1.0 -L ${TOOLCHAIN_PATH}/sysroot"
+else
+ QEMU_OPTION="-cpu rv64,zba=true,zbb=true,zbc=true,zbs=true -L ${TOOLCHAIN_PATH}/sysroot"
+fi
+
+"$QEMU_PREFIX_PATH"/bin/qemu-riscv64 $QEMU_OPTION "$@"
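Example run, assuming a test binary cross-compiled as above (the binary path is illustrative). Note the default TOOLCHAIN_PATH and QEMU_PREFIX_PATH are relative, so override them when running from a different working directory:

    USE_RVV=ON ./riscv_script/run_qemu.sh out/riscv/libyuv_unittest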
diff --git a/files/source/compare.cc b/source/compare.cc
index 5aa3a4db..50a736bd 100644
--- a/files/source/compare.cc
+++ b/source/compare.cc
@@ -45,7 +45,7 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
}
#endif
- while (count >= (uint64_t)(kBlockSize)) {
+ while (count >= (uint64_t)kBlockSize) {
seed = HashDjb2_SSE(src, kBlockSize, seed);
src += kBlockSize;
count -= kBlockSize;
@@ -69,13 +69,13 @@ static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
- if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ if (argb[3] != 255) { // Fourth byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
return FOURCC_BGRA;
}
- if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
+ if (argb[7] != 255) { // Second pixel fourth byte is not Alpha of 255.
return FOURCC_ARGB;
}
argb += 8;
@@ -154,11 +154,6 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a,
HammingDistance = HammingDistance_MSA;
}
#endif
-#if defined(HAS_HAMMINGDISTANCE_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- HammingDistance = HammingDistance_MMI;
- }
-#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : diff)
@@ -216,11 +211,6 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a,
SumSquareError = SumSquareError_MSA;
}
#endif
-#if defined(HAS_SUMSQUAREERROR_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SumSquareError = SumSquareError_MMI;
- }
-#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
@@ -369,10 +359,10 @@ static double Ssim8x8_C(const uint8_t* src_a,
(sum_a_sq + sum_b_sq + c1) *
(count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
- if (ssim_d == 0.0) {
+ if (ssim_d == 0) {
return DBL_MAX;
}
- return ssim_n * 1.0 / ssim_d;
+ return (double)ssim_n / (double)ssim_d;
}
}
diff --git a/files/source/compare_common.cc b/source/compare_common.cc
index d4b170ad..d1cab8d2 100644
--- a/files/source/compare_common.cc
+++ b/source/compare_common.cc
@@ -17,36 +17,6 @@ namespace libyuv {
extern "C" {
#endif
-#if ORIGINAL_OPT
-uint32_t HammingDistance_C1(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- int i;
- for (i = 0; i < count; ++i) {
- int x = src_a[i] ^ src_b[i];
- if (x & 1)
- ++diff;
- if (x & 2)
- ++diff;
- if (x & 4)
- ++diff;
- if (x & 8)
- ++diff;
- if (x & 16)
- ++diff;
- if (x & 32)
- ++diff;
- if (x & 64)
- ++diff;
- if (x & 128)
- ++diff;
- }
- return diff;
-}
-#endif
-
// Hakmem method for hamming distance.
uint32_t HammingDistance_C(const uint8_t* src_a,
const uint8_t* src_b,
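For reference, the Hakmem-style kernel kept here counts bits in each 32-bit word x = a XOR b with the classic SWAR identity (each nibble of u ends up holding the bit count of the matching nibble of x; the final mod 255 sums the per-byte totals). This is a sketch of the identity, not a quote of the implementation:

    u = x - ((x >> 1) & 0x77777777) - ((x >> 2) & 0x33333333) - ((x >> 3) & 0x11111111)
    popcount(x) = ((u + (u >> 4)) & 0x0F0F0F0F) mod 255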
diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc
new file mode 100644
index 00000000..33cbe25d
--- /dev/null
+++ b/source/compare_gcc.cc
@@ -0,0 +1,359 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(__x86_64__)
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint64_t diff = 0u;
+
+ asm volatile(
+ "xor %3,%3 \n"
+ "xor %%r8,%%r8 \n"
+ "xor %%r9,%%r9 \n"
+ "xor %%r10,%%r10 \n"
+
+ // Process 32 bytes per loop.
+ LABELALIGN
+ "1: \n"
+ "mov (%0),%%rcx \n"
+ "mov 0x8(%0),%%rdx \n"
+ "xor (%1),%%rcx \n"
+ "xor 0x8(%1),%%rdx \n"
+ "popcnt %%rcx,%%rcx \n"
+ "popcnt %%rdx,%%rdx \n"
+ "mov 0x10(%0),%%rsi \n"
+ "mov 0x18(%0),%%rdi \n"
+ "xor 0x10(%1),%%rsi \n"
+ "xor 0x18(%1),%%rdi \n"
+ "popcnt %%rsi,%%rsi \n"
+ "popcnt %%rdi,%%rdi \n"
+ "add $0x20,%0 \n"
+ "add $0x20,%1 \n"
+ "add %%rcx,%3 \n"
+ "add %%rdx,%%r8 \n"
+ "add %%rsi,%%r9 \n"
+ "add %%rdi,%%r10 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ "add %%r8, %3 \n"
+ "add %%r9, %3 \n"
+ "add %%r10, %3 \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=r"(diff) // %3
+ :
+ : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
+
+ return (uint32_t)(diff);
+}
+#else
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ asm volatile(
+ // Process 16 bytes per loop.
+ LABELALIGN
+ "1: \n"
+ "mov (%0),%%ecx \n"
+ "mov 0x4(%0),%%edx \n"
+ "xor (%1),%%ecx \n"
+ "xor 0x4(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "mov 0x8(%0),%%ecx \n"
+ "mov 0xc(%0),%%edx \n"
+ "xor 0x8(%1),%%ecx \n"
+ "xor 0xc(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "add $0x10,%0 \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "+r"(diff) // %3
+ :
+ : "memory", "cc", "ecx", "edx");
+
+ return diff;
+}
+#endif
+
+static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15};
+static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
+
+uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ asm volatile(
+ "movdqa %4,%%xmm2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqa (%0),%%xmm4 \n"
+ "movdqa 0x10(%0), %%xmm5 \n"
+ "pxor (%0,%1), %%xmm4 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pand %%xmm2,%%xmm6 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm6,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "paddb %%xmm7,%%xmm6 \n"
+ "pxor 0x10(%0,%1),%%xmm5 \n"
+ "add $0x20,%0 \n"
+ "movdqa %%xmm5,%%xmm4 \n"
+ "pand %%xmm2,%%xmm5 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm5,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufb %%xmm4,%%xmm5 \n"
+ "paddb %%xmm7,%%xmm5 \n"
+ "paddb %%xmm5,%%xmm6 \n"
+ "psadbw %%xmm1,%%xmm6 \n"
+ "paddd %%xmm6,%%xmm0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ "pshufd $0xaa,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0, %3 \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=r"(diff) // %3
+ : "m"(kNibbleMask), // %4
+ "m"(kBitCount) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+
+ return diff;
+}
+
+#ifdef HAS_HAMMINGDISTANCE_AVX2
+uint32_t HammingDistance_AVX2(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ asm volatile(
+ "vbroadcastf128 %4,%%ymm2 \n"
+ "vbroadcastf128 %5,%%ymm3 \n"
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqa (%0),%%ymm4 \n"
+ "vmovdqa 0x20(%0), %%ymm5 \n"
+ "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm6 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
+ "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
+ "add $0x40,%0 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm5 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
+ "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
+ "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+
+ "vpermq $0xb1,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xaa,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovd %%xmm0, %3 \n"
+ "vzeroupper \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=r"(diff) // %3
+ : "m"(kNibbleMask), // %4
+ "m"(kBitCount) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+
+ return diff;
+}
+#endif // HAS_HAMMINGDISTANCE_AVX2
+
+uint32_t SumSquareError_SSE2(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse;
+ asm volatile(
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+ return sse;
+}
+
+static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
+static const uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static const uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static const uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static const uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
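+
+// For reference (an illustrative sketch, not the project's scalar path):
+// djb2 is hash = hash * 33 + byte. Folding 16 bytes per iteration gives the
+// same result because
+//   hash' = hash * 33^16 + b0 * 33^15 + b1 * 33^14 + ... + b15 * 33^0,
+// which is exactly what kHash16x33 and the kHashMul tables above encode.
+static uint32_t HashDjb2_Sketch(const uint8_t* src, int count, uint32_t seed) {
+  uint32_t hash = seed;
+  int i;
+  for (i = 0; i < count; ++i) {
+    hash = hash * 33u + src[i];
+  }
+  return hash;
+}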
+
+uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
+ uint32_t hash;
+ asm volatile(
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
+ : "+r"(src), // %0
+ "+r"(count), // %1
+ "+rm"(seed), // %2
+ "=g"(hash) // %3
+ : "m"(kHash16x33), // %4
+ "m"(kHashMul0), // %5
+ "m"(kHashMul1), // %6
+ "m"(kHashMul2), // %7
+ "m"(kHashMul3) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+ return hash;
+}
+#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/compare_msa.cc b/source/compare_msa.cc
index 0b807d37..0b807d37 100644
--- a/files/source/compare_msa.cc
+++ b/source/compare_msa.cc
diff --git a/source/compare_neon.cc b/source/compare_neon.cc
new file mode 100644
index 00000000..afdd6012
--- /dev/null
+++ b/source/compare_neon.cc
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff;
+
+ asm volatile(
+ "vmov.u16 q4, #0 \n" // accumulator
+
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n"
+ "vld1.8 {q2, q3}, [%1]! \n"
+ "veor.32 q0, q0, q2 \n"
+ "veor.32 q1, q1, q3 \n"
+ "vcnt.i8 q0, q0 \n"
+ "vcnt.i8 q1, q1 \n"
+ "subs %2, %2, #32 \n"
+ "vadd.u8 q0, q0, q1 \n" // 16 byte counts
+ "vpadal.u8 q4, q0 \n" // 8 shorts
+ "bgt 1b \n"
+
+ "vpaddl.u16 q0, q4 \n" // 4 ints
+ "vpadd.u32 d0, d0, d1 \n"
+ "vpadd.u32 d0, d0, d0 \n"
+ "vmov.32 %3, d0[0] \n"
+
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+ :
+ : "cc", "q0", "q1", "q2", "q3", "q4");
+ return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse;
+ asm volatile(
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
+
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
+
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ return sse;
+}
+
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
new file mode 100644
index 00000000..70fb9b91
--- /dev/null
+++ b/source/compare_neon64.cc
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff;
+ asm volatile(
+ "movi v4.8h, #0 \n"
+
+ "1: \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
+ "eor v0.16b, v0.16b, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "eor v1.16b, v1.16b, v3.16b \n"
+ "cnt v0.16b, v0.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "cnt v1.16b, v1.16b \n"
+ "subs %w2, %w2, #32 \n"
+ "add v0.16b, v0.16b, v1.16b \n"
+ "uadalp v4.8h, v0.16b \n"
+ "b.gt 1b \n"
+
+ "uaddlv s4, v4.8h \n"
+ "fmov %w3, s4 \n"
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v4");
+ return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse;
+ asm volatile(
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
+
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %w2, %w2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
+
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+ return sse;
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/compare_win.cc b/source/compare_win.cc
index d57d3d9d..9bb27f1d 100644
--- a/files/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -22,8 +22,9 @@ namespace libyuv {
extern "C" {
#endif
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for 32 bit Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ !defined(__clang__) && defined(_M_IX86)
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
const uint8_t* src_b,
@@ -77,8 +78,7 @@ __declspec(naked) uint32_t
}
}
-// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
+#ifdef HAS_SUMSQUAREERROR_AVX2
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable : 4752)
__declspec(naked) uint32_t
@@ -118,7 +118,7 @@ __declspec(naked) uint32_t
ret
}
}
-#endif // _MSC_VER >= 1700
+#endif // HAS_SUMSQUAREERROR_AVX2
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
uvec32 kHashMul0 = {
@@ -196,7 +196,7 @@ __declspec(naked) uint32_t
}
// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
+#ifdef HAS_HASHDJB2_AVX2
__declspec(naked) uint32_t
HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
__asm {
@@ -231,7 +231,7 @@ __declspec(naked) uint32_t
ret
}
}
-#endif // _MSC_VER >= 1700
+#endif // HAS_HASHDJB2_AVX2
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
diff --git a/source/convert.cc b/source/convert.cc
new file mode 100644
index 00000000..6ac5bc43
--- /dev/null
+++ b/source/convert.cc
@@ -0,0 +1,4055 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/scale_row.h" // For FixedDiv
+#include "libyuv/scale_uv.h" // For UVScale()
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Subsample amount uses a shift.
+// v is value
+// a is amount to add to round up
+// s is shift to subsample down
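+// e.g. SUBSAMPLE(5, 1, 1) == 3 and SUBSAMPLE(-5, 1, 1) == -3: half size,
+// rounded away from zero.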
+#define SUBSAMPLE(v, a, s) (((v) < 0) ? -((-(v) + (a)) >> (s)) : (((v) + (a)) >> (s)))
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// Any I4xx To I420 format with mirroring.
+static int I4xxToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int src_y_width,
+ int src_y_height,
+ int src_uv_width,
+ int src_uv_height) {
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+ const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+ int r;
+ if (src_uv_width <= 0 || src_uv_height == 0) {
+ return -1;
+ }
+ if (dst_y) {
+ r = ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+ dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+ dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+ dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
+ return r;
+}
+
+// Copy I420 with optional flipping.
+// TODO(fbarchard): Use ScalePlane, which supports mirroring, but ensure
+// it does row coalescing.
+LIBYUV_API
+int I420Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ // Copy UV planes.
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+}
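+
+// Usage note (illustrative; variable names are hypothetical): passing a
+// negative height flips the copy vertically, e.g.
+//   I420Copy(src_y, w, src_u, (w + 1) / 2, src_v, (w + 1) / 2,
+//            dst_y, w, dst_u, (w + 1) / 2, dst_v, (w + 1) / 2, w, -h);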
+
+// Copy I010 with optional flipping.
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ // Copy UV planes.
+ CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+}
+
+static int Planar16bitTo8bit(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int subsample_x,
+ int subsample_y,
+ int depth) {
+ int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
+ int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+ int scale = 1 << (24 - depth);
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ uv_height = -uv_height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (uv_height - 1) * src_stride_u;
+ src_v = src_v + (uv_height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Convert Y plane.
+ Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
+ height);
+ // Convert UV planes.
+ Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale, uv_width,
+ uv_height);
+ Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale, uv_width,
+ uv_height);
+ return 0;
+}
+
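+// Any I41x (4:4:4 chroma at 10 or 12 bits) to I420: the Y plane is narrowed
+// to 8 bits, and each full-resolution chroma plane is halved in both
+// dimensions and narrowed in a single ScalePlaneDown2_16To8 pass.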
+static int I41xToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int depth) {
+ const int scale = 1 << (24 - depth);
+
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ {
+ const int uv_width = SUBSAMPLE(width, 1, 1);
+ const int uv_height = SUBSAMPLE(height, 1, 1);
+
+ Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
+ height);
+ ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_u,
+ dst_stride_u, src_u, dst_u, scale, kFilterBilinear);
+ ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_v,
+ dst_stride_v, src_v, dst_v, scale, kFilterBilinear);
+ }
+ return 0;
+}
+
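+// Any I21x (4:2:2 chroma at 10 or 12 bits) to I420: chroma is already half
+// width, so only a vertical 2:1 scale (ScalePlaneVertical_16To8) plus the
+// narrowing to 8 bits is needed.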
+static int I21xToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int depth) {
+ const int scale = 1 << (24 - depth);
+
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ {
+ const int uv_width = SUBSAMPLE(width, 1, 1);
+ const int uv_height = SUBSAMPLE(height, 1, 1);
+ const int dy = FixedDiv(height, uv_height);
+
+ Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
+ height);
+ ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u,
+ dst_stride_u, src_u, dst_u, 0, 32768, dy,
+ /*bpp=*/1, scale, kFilterBilinear);
+ ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v,
+ dst_stride_v, src_v, dst_v, 0, 32768, dy,
+ /*bpp=*/1, scale, kFilterBilinear);
+ }
+ return 0;
+}
+
+// Convert 10 bit YUV to 8 bit.
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 1,
+ 1, 10);
+}
+
+LIBYUV_API
+int I210ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 10);
+}
+
+LIBYUV_API
+int I210ToI422(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 1,
+ 0, 10);
+}
+
+LIBYUV_API
+int I410ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 10);
+}
+
+LIBYUV_API
+int I410ToI444(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 0,
+ 0, 10);
+}
+
+LIBYUV_API
+int I012ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 1,
+ 1, 12);
+}
+
+LIBYUV_API
+int I212ToI422(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 1,
+ 0, 12);
+}
+
+LIBYUV_API
+int I212ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 12);
+}
+
+LIBYUV_API
+int I412ToI444(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 0,
+ 0, 12);
+}
+
+LIBYUV_API
+int I412ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 12);
+}
+
+// Any Ix10 To I010 format with mirroring.
+static int Ix10ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int subsample_x,
+ int subsample_y) {
+ const int dst_y_width = Abs(width);
+ const int dst_y_height = Abs(height);
+ const int src_uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
+ const int src_uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+ const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+ const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+ int r;
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+ if (dst_y) {
+ r = ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ dst_y_width, dst_y_height, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = ScalePlane_12(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+ dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+ dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
+ return r;
+}
+
+LIBYUV_API
+int I410ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 0, 0);
+}
+
+LIBYUV_API
+int I210ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 1, 0);
+}
+
+// Any I[420]1[02] to P[420]1[02] format with mirroring.
+static int IxxxToPxxx(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height,
+ int subsample_x,
+ int subsample_y,
+ int depth) {
+ const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
+ const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ ConvertToMSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
+ depth);
+ MergeUVPlane_16(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+ dst_stride_uv, uv_width, uv_height, depth);
+ return 0;
+}
+
+LIBYUV_API
+int I010ToP010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
+ width, height, 1, 1, 10);
+}
+
+LIBYUV_API
+int I210ToP210(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
+ width, height, 1, 0, 10);
+}
+
+LIBYUV_API
+int I012ToP012(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
+ width, height, 1, 1, 12);
+}
+
+LIBYUV_API
+int I212ToP212(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
+ width, height, 1, 0, 12);
+}
+
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I422ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ const int src_uv_width = SUBSAMPLE(width, 1, 1);
+ return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, src_uv_width, height);
+}
+
+LIBYUV_API
+int I422ToI210(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Convert Y plane.
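+ // The scale of 1024 (1 << 10) maps 8-bit samples into the 10-bit range.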
+ Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,
+ height);
+ // Convert UV planes.
+ Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
+ height);
+ Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
+ height);
+ return 0;
+}
+
+// TODO(fbarchard): Implement row conversion.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Allocate u and v buffers
+ align_buffer_64(plane_u, halfwidth * halfheight * 2);
+ uint8_t* plane_v = plane_u + halfwidth * halfheight;
+ if (!plane_u)
+ return 1;
+
+ I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
+ height);
+ MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
+ halfwidth, halfheight);
+ free_aligned_buffer_64(plane_u);
+ return 0;
+}
+
+LIBYUV_API
+int MM21ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_uv || !dst_uv || width <= 0) {
+ return -1;
+ }
+
+ int sign = height < 0 ? -1 : 1;
+
+ if (dst_y) {
+ DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32);
+ }
+ DetilePlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, (width + 1) & ~1,
+ (height + sign) / 2, 16);
+
+ return 0;
+}
+
+LIBYUV_API
+int MM21ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int sign = height < 0 ? -1 : 1;
+
+ if (!src_uv || !dst_u || !dst_v || width <= 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32);
+ }
+ DetileSplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, (width + 1) & ~1, (height + sign) / 2, 16);
+
+ return 0;
+}
+
+LIBYUV_API
+int MM21ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height) {
+ if (!src_y || !src_uv || !dst_yuy2 || width <= 0) {
+ return -1;
+ }
+
+ DetileToYUY2(src_y, src_stride_y, src_uv, src_stride_uv, dst_yuy2,
+ dst_stride_yuy2, width, height, 32);
+
+ return 0;
+}
+
+// Convert MT2T into P010. See tinyurl.com/mtk-10bit-video-format for format
+// documentation.
+// TODO(greenjustin): Add an MT2T to I420 conversion.
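+// A row of Y tiles holds padded_width * 32 samples packed at 10 bits each,
+// hence y_tile_row_size = padded_width * 32 * 10 / 8 bytes below; UV tiles
+// are 16 rows tall, so uv_tile_row_size uses 16.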
+LIBYUV_API
+int MT2TToP010(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (width <= 0 || !height || !src_uv || !dst_uv) {
+ return -1;
+ }
+
+ {
+ int uv_width = (width + 1) & ~1;
+ int uv_height = (height + 1) / 2;
+ int y = 0;
+ const int tile_width = 16;
+ const int y_tile_height = 32;
+ const int uv_tile_height = 16;
+ int padded_width = (width + tile_width - 1) & ~(tile_width - 1);
+ int y_tile_row_size = padded_width * y_tile_height * 10 / 8;
+ int uv_tile_row_size = padded_width * uv_tile_height * 10 / 8;
+ size_t row_buf_size = padded_width * y_tile_height * sizeof(uint16_t);
+ void (*UnpackMT2T)(const uint8_t* src, uint16_t* dst, size_t size) =
+ UnpackMT2T_C;
+ align_buffer_64(row_buf, row_buf_size);
+ if (!row_buf)
+ return 1;
+
+#if defined(HAS_UNPACKMT2T_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UnpackMT2T = UnpackMT2T_NEON;
+ }
+#endif
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ uv_height = (height + 1) / 2;
+ if (dst_y) {
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ dst_uv = dst_uv + (uv_height - 1) * dst_stride_uv;
+ dst_stride_uv = -dst_stride_uv;
+ }
+
+ // Unpack and detile Y in rows of tiles
+ if (src_y && dst_y) {
+ for (y = 0; y < (height & ~(y_tile_height - 1)); y += y_tile_height) {
+ UnpackMT2T(src_y, (uint16_t*)row_buf, y_tile_row_size);
+ DetilePlane_16((uint16_t*)row_buf, padded_width, dst_y, dst_stride_y,
+ width, y_tile_height, y_tile_height);
+ src_y += src_stride_y * y_tile_height;
+ dst_y += dst_stride_y * y_tile_height;
+ }
+ if (height & (y_tile_height - 1)) {
+ UnpackMT2T(src_y, (uint16_t*)row_buf, y_tile_row_size);
+ DetilePlane_16((uint16_t*)row_buf, padded_width, dst_y, dst_stride_y,
+ width, height & (y_tile_height - 1), y_tile_height);
+ }
+ }
+
+ // Unpack and detile UV plane
+ for (y = 0; y < (uv_height & ~(uv_tile_height - 1)); y += uv_tile_height) {
+ UnpackMT2T(src_uv, (uint16_t*)row_buf, uv_tile_row_size);
+ DetilePlane_16((uint16_t*)row_buf, padded_width, dst_uv, dst_stride_uv,
+ uv_width, uv_tile_height, uv_tile_height);
+ src_uv += src_stride_uv * uv_tile_height;
+ dst_uv += dst_stride_uv * uv_tile_height;
+ }
+ if (uv_height & (uv_tile_height - 1)) {
+ UnpackMT2T(src_uv, (uint16_t*)row_buf, uv_tile_row_size);
+ DetilePlane_16((uint16_t*)row_buf, padded_width, dst_uv, dst_stride_uv,
+ uv_width, uv_height & (uv_tile_height - 1),
+ uv_tile_height);
+ }
+ free_aligned_buffer_64(row_buf);
+ }
+ return 0;
+}
+
+#ifdef I422TONV21_ROW_VERSION
+// Unittest fails for this version.
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+// Swap src_u and src_v to implement I422ToNV12
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_uv, int width) = MergeUVRow_C;
+ void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow = MergeUVRow_RVV;
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height);
+ }
+ {
+ // Allocate 2 rows of vu.
+ int awidth = halfwidth * 2;
+ align_buffer_64(row_vu_0, awidth * 2);
+ uint8_t* row_vu_1 = row_vu_0 + awidth;
+ if (!row_vu_0)
+ return 1;
+
+ for (y = 0; y < height - 1; y += 2) {
+ MergeUVRow(src_v, src_u, row_vu_0, halfwidth);
+ MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1,
+ halfwidth);
+ InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ MergeUVRow(src_v, src_u, dst_vu, halfwidth);
+ }
+ free_aligned_buffer_64(row_vu_0);
+ }
+ return 0;
+}
+#endif // I422TONV21_ROW_VERSION
+
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I444ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, width, height);
+}
+
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_y || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
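+ // HalfMergeUVPlane 2x2-subsamples the full-resolution U and V planes and
+ // interleaves the result directly into the destination UV plane.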
+ HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+ dst_stride_uv, width, height);
+ return 0;
+}
+
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+ width, height);
+}
+
+// I400 is greyscale, typically used in MJPG.
+LIBYUV_API
+int I400ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
+ SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
+ return 0;
+}
+
+// I400 is greyscale, typically used in MJPG.
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128);
+ return 0;
+}
+
+// Convert NV12 to I420.
+// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm.
+LIBYUV_API
+int NV12ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+ // Coalesce rows: when the strides equal the row width, the plane is
+ // contiguous and can be processed as one long row.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+ // Coalesce chroma rows likewise.
+ if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
+ dst_stride_v == halfwidth) {
+ halfwidth *= halfheight;
+ halfheight = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
+ }
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // Split UV plane - NV12 / NV21
+ SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ halfwidth, halfheight);
+
+ return 0;
+}
+
+// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
+LIBYUV_API
+int NV21ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+ dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u,
+ width, height);
+}
+
+LIBYUV_API
+int NV12ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int r;
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
+ Abs(height), kFilterBilinear);
+ return r;
+}
+
+LIBYUV_API
+int NV16ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int r;
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
+ dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
+ return r;
+}
+
+// Any P[420]1[02] to I[420]1[02] format with mirroring.
+static int PxxxToIxxx(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int subsample_x,
+ int subsample_y,
+ int depth) {
+ const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
+ const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+ ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
+ depth);
+ SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, uv_width, uv_height, depth);
+ return 0;
+}
+
+LIBYUV_API
+int P010ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return PxxxToIxxx(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ width, height, 1, 1, 10);
+}
+
+LIBYUV_API
+int P012ToI012(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return PxxxToIxxx(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ width, height, 1, 1, 12);
+}
+
+LIBYUV_API
+int P010ToP410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int r;
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
+ Abs(height), kFilterBilinear);
+ return r;
+}
+
+LIBYUV_API
+int P210ToP410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int r;
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
+ dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
+ return r;
+}
+
+// Convert YUY2 to I420.
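+// YUY2 packs two pixels into four bytes (Y0 U Y1 V), so chroma is shared
+// horizontally. YUY2ToUVRow also averages two source rows (note the stride
+// argument below; a final odd row passes 0 to reuse a single row).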
+LIBYUV_API
+int YUY2ToI420(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ YUY2ToUVRow_C;
+ void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+ YUY2ToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUVRow = YUY2ToUVRow_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
+ YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_MSA;
+ YUY2ToUVRow = YUY2ToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LSX;
+ YUY2ToUVRow = YUY2ToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_LSX;
+ YUY2ToUVRow = YUY2ToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LASX;
+ YUY2ToUVRow = YUY2ToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_LASX;
+ YUY2ToUVRow = YUY2ToUVRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ UYVYToUVRow_C;
+ void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+ UYVYToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToUVRow = UYVYToUVRow_AVX2;
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUVRow = UYVYToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToYRow = UYVYToYRow_Any_MSA;
+ UYVYToUVRow = UYVYToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_MSA;
+ UYVYToUVRow = UYVYToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ UYVYToYRow = UYVYToYRow_Any_LSX;
+ UYVYToUVRow = UYVYToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_LSX;
+ UYVYToUVRow = UYVYToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ UYVYToYRow = UYVYToYRow_Any_LASX;
+ UYVYToUVRow = UYVYToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_LASX;
+ UYVYToUVRow = UYVYToUVRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+ src_uyvy += src_stride_uyvy * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+ uint8_t* dst_uv, int width) = AYUVToUVRow_C;
+ void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+ AYUVToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+ src_stride_ayuv = -src_stride_ayuv;
+ }
+// Placeholders for future Intel code.
+#if defined(HAS_AYUVTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ AYUVToUVRow = AYUVToUVRow_Any_SSE2;
+ AYUVToYRow = AYUVToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToUVRow = AYUVToUVRow_SSE2;
+ AYUVToYRow = AYUVToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AYUVToUVRow = AYUVToUVRow_Any_AVX2;
+ AYUVToYRow = AYUVToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ AYUVToUVRow = AYUVToUVRow_AVX2;
+ AYUVToYRow = AYUVToYRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AYUVToYRow = AYUVToYRow_Any_NEON;
+ AYUVToUVRow = AYUVToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToYRow = AYUVToYRow_NEON;
+ AYUVToUVRow = AYUVToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+ src_ayuv += src_stride_ayuv * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ AYUVToUVRow(src_ayuv, 0, dst_uv, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert AYUV to NV21.
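+// Identical to AYUVToNV12 except the interleaved chroma plane is written
+// V-first (VU order) via the AYUVToVURow kernels.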
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+ uint8_t* dst_vu, int width) = AYUVToVURow_C;
+ void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+ AYUVToYRow_C;
+ if (!src_ayuv || !dst_y || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+ src_stride_ayuv = -src_stride_ayuv;
+ }
+// Placeholders for future Intel code.
+#if defined(HAS_AYUVTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ AYUVToVURow = AYUVToVURow_Any_SSE2;
+ AYUVToYRow = AYUVToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToVURow = AYUVToVURow_SSE2;
+ AYUVToYRow = AYUVToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AYUVToVURow = AYUVToVURow_Any_AVX2;
+ AYUVToYRow = AYUVToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ AYUVToVURow = AYUVToVURow_AVX2;
+ AYUVToYRow = AYUVToYRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AYUVToYRow = AYUVToYRow_Any_NEON;
+ AYUVToVURow = AYUVToVURow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToYRow = AYUVToYRow_NEON;
+ AYUVToVURow = AYUVToVURow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+ src_ayuv += src_stride_ayuv * 2;
+ dst_y += dst_stride_y * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ AYUVToVURow(src_ayuv, 0, dst_vu, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ARGB to I420.
+LIBYUV_API
+int ARGBToI420(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ ARGBToUVRow = ARGBToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ ARGBToUVRow = ARGBToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ return 0;
+}
+
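+// ARGBToI420Alpha has two variants selected by USE_EXTRACTALPHA: a two-pass
+// version (I420 conversion followed by a full-image alpha extraction) and a
+// fused single-pass loop that fills Y, U, V and A together.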
+#ifdef USE_EXTRACTALPHA
+// Convert ARGB to I420 with Alpha
+// The following version calls ARGBExtractAlpha on the full image.
+LIBYUV_API
+int ARGBToI420Alpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height) {
+ int r = ARGBToI420(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height);
+ if (r == 0) {
+ r = ARGBExtractAlpha(src_argb, src_stride_argb, dst_a, dst_stride_a, width,
+ height);
+ }
+ return r;
+}
+#else // USE_EXTRACTALPHA
+// Convert ARGB to I420 with Alpha
+LIBYUV_API
+int ARGBToI420Alpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a,
+ int width) = ARGBExtractAlphaRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || !dst_a || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2
+ : ARGBExtractAlphaRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2
+ : ARGBExtractAlphaRow_Any_AVX2;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON
+ : ARGBExtractAlphaRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA
+ : ARGBExtractAlphaRow_Any_MSA;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_LSX
+ : ARGBExtractAlphaRow_Any_LSX;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV;
+ }
+#endif
+
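+ // Y and A are full resolution, so their pointers advance two strides per
+ // iteration; U and V are subsampled 2x2 and advance one stride.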
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ ARGBExtractAlphaRow(src_argb, dst_a, width);
+ ARGBExtractAlphaRow(src_argb + src_stride_argb, dst_a + dst_stride_a,
+ width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ dst_a += dst_stride_a * 2;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBExtractAlphaRow(src_argb, dst_a, width);
+ }
+ return 0;
+}
+#endif // USE_EXTRACTALPHA
+
+// Convert BGRA to I420.
+LIBYUV_API
+int BGRAToI420(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ BGRAToUVRow_C;
+ void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) =
+ BGRAToYRow_C;
+ if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
+ }
+#if defined(HAS_BGRATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ BGRAToYRow = BGRAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ BGRAToUVRow = BGRAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ BGRAToYRow = BGRAToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ BGRAToYRow = BGRAToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ BGRAToYRow = BGRAToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ BGRAToUVRow = BGRAToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ BGRAToUVRow = BGRAToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ BGRAToYRow = BGRAToYRow_Any_MSA;
+ BGRAToUVRow = BGRAToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ BGRAToUVRow = BGRAToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_LSX) && defined(HAS_BGRATOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ BGRAToYRow = BGRAToYRow_Any_LSX;
+ BGRAToUVRow = BGRAToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_LSX;
+ BGRAToUVRow = BGRAToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ BGRAToYRow = BGRAToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ BGRAToYRow = BGRAToYRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_RVV)
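+ // This RVV path provides only the Y kernel; the UV rows stay on the C path.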
+ if (TestCpuFlag(kCpuHasRVV)) {
+ BGRAToYRow = BGRAToYRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+ src_bgra += src_stride_bgra * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ABGR to I420.
+LIBYUV_API
+int ABGRToI420(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+ ABGRToYRow_C;
+ if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_LSX) && defined(HAS_ABGRTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYRow = ABGRToYRow_Any_LSX;
+ ABGRToUVRow = ABGRToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_LSX;
+ ABGRToUVRow = ABGRToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ABGRToYRow = ABGRToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYRow = ABGRToYRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ABGRToYRow = ABGRToYRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert RGBA to I420.
+LIBYUV_API
+int RGBAToI420(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGBAToUVRow_C;
+ void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) =
+ RGBAToYRow_C;
+ if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+#if defined(HAS_RGBATOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToYRow = RGBAToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToYRow = RGBAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToUVRow = RGBAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYRow = RGBAToYRow_Any_MSA;
+ RGBAToUVRow = RGBAToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_MSA;
+ RGBAToUVRow = RGBAToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_LSX) && defined(HAS_RGBATOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGBAToYRow = RGBAToYRow_Any_LSX;
+ RGBAToUVRow = RGBAToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_LSX;
+ RGBAToUVRow = RGBAToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGBAToYRow = RGBAToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGBAToYRow = RGBAToYRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGBAToYRow = RGBAToYRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+ src_rgba += src_stride_rgba * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ }
+ return 0;
+}
+
+// Enabled if 1 pass is available
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_LSX) || defined(HAS_RGB24TOYROW_RVV))
+#define HAS_RGB24TOYROW
+#endif
+
+// Convert RGB24 to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if defined(HAS_RGB24TOYROW)
+ void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGB24ToUVRow_C;
+ void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
+ RGB24ToYRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+#if defined(HAS_RGB24TOYROW)
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+ RGB24ToYRow = RGB24ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYRow = RGB24ToYRow_NEON;
+ RGB24ToUVRow = RGB24ToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
+ RGB24ToYRow = RGB24ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYRow = RGB24ToYRow_MSA;
+ RGB24ToUVRow = RGB24ToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYROW_LSX) && defined(HAS_RGB24TOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_LSX;
+ RGB24ToYRow = RGB24ToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYRow = RGB24ToYRow_LSX;
+ RGB24ToUVRow = RGB24ToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYROW_LASX) && defined(HAS_RGB24TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_LASX;
+ RGB24ToYRow = RGB24ToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYRow = RGB24ToYRow_LASX;
+ RGB24ToUVRow = RGB24ToUVRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGB24ToYRow = RGB24ToYRow_RVV;
+ }
+#endif
+
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else // HAS_RGB24TOYROW
+
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RGB24TOYROW
+
+ {
+#if !defined(HAS_RGB24TOYROW)
+ // Allocate 2 rows of ARGB.
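+ // Each row is rounded up to a multiple of 32 bytes so both scratch rows
+ // stay SIMD-aligned.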
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+
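+ // One-pass platforms convert RGB24 rows directly; the fallback expands each
+ // pair of rows into the ARGB scratch buffer and runs the ARGB kernels on it.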
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYROW)
+ RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+ RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width);
+ ARGBToUVRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB24TOYROW)
+ RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RGB24TOYROW)
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+#undef HAS_RGB24TOYROW
+
+// Enabled if 1 pass is available
+#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
+ defined(HAS_RGB24TOYJROW_RVV)
+#define HAS_RGB24TOYJROW
+#endif
+
+// Convert RGB24 to J420.
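+// J420 is full-range (JPEG) I420; the YJ/UVJ row kernels apply full-range
+// BT.601 coefficients.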
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if defined(HAS_RGB24TOYJROW)
+ void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGB24ToUVJRow_C;
+ void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
+ RGB24ToYJRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+#if defined(HAS_RGB24TOYJROW)
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
+ RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_NEON;
+ RGB24ToUVJRow = RGB24ToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA;
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ RGB24ToUVJRow = RGB24ToUVJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYJRow = RGB24ToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGB24ToYJRow = RGB24ToYJRow_RVV;
+ }
+#endif
+
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else // HAS_RGB24TOYJROW
+
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RGB24TOYJROW
+
+ {
+#if !defined(HAS_RGB24TOYJROW)
+ // Allocate 2 rows of ARGB.
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYJROW)
+ RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYJRow(src_rgb24, dst_y, width);
+ RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width);
+ ARGBToUVJRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB24TOYJROW)
+ RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYJRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVJRow(row, 0, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RGB24TOYJROW)
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+#undef HAS_RGB24TOYJROW
+
+// Enabled if 1 pass is available
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_LSX) || defined(HAS_RAWTOYROW_RVV))
+#define HAS_RAWTOYROW
+#endif
+
+// Convert RAW to I420.
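+// RAW is RGB24 with the R and B bytes swapped; the structure mirrors
+// RGB24ToI420.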
+LIBYUV_API
+int RAWToI420(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if defined(HAS_RAWTOYROW)
+ void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
+ uint8_t* dst_v, int width) = RAWToUVRow_C;
+ void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
+ RAWToYRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+#if defined(HAS_RAWTOYROW)
+
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVRow = RAWToUVRow_Any_NEON;
+ RAWToYRow = RAWToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYRow = RAWToYRow_NEON;
+ RAWToUVRow = RAWToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToUVRow = RAWToUVRow_Any_MSA;
+ RAWToYRow = RAWToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYRow = RAWToYRow_MSA;
+ RAWToUVRow = RAWToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYROW_LSX) && defined(HAS_RAWTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToUVRow = RAWToUVRow_Any_LSX;
+ RAWToYRow = RAWToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYRow = RAWToYRow_LSX;
+ RAWToUVRow = RAWToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYROW_LASX) && defined(HAS_RAWTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToUVRow = RAWToUVRow_Any_LASX;
+ RAWToYRow = RAWToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYRow = RAWToYRow_LASX;
+ RAWToUVRow = RAWToUVRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToYRow = RAWToYRow_RVV;
+ }
+#endif
+
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else // HAS_RAWTOYROW
+
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RAWTOYROW
+
+ {
+#if !defined(HAS_RAWTOYROW)
+ // Allocate 2 rows of ARGB.
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYROW)
+ RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+ RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width);
+ ARGBToUVRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RAWTOYROW)
+ RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RAWTOYROW)
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+#undef HAS_RAWTOYROW
+
+// Enabled if 1 pass is available
+#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \
+ defined(HAS_RAWTOYJROW_RVV)
+#define HAS_RAWTOYJROW
+#endif
+
+// Convert RAW to J420.
+LIBYUV_API
+int RAWToJ420(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if defined(HAS_RAWTOYJROW)
+ void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RAWToUVJRow_C;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
+ RAWToYJRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+#if defined(HAS_RAWTOYJROW)
+
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVJRow = RAWToUVJRow_Any_NEON;
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ RAWToUVJRow = RAWToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToUVJRow = RAWToUVJRow_Any_MSA;
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ RAWToUVJRow = RAWToUVJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToYJRow = RAWToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToYJRow = RAWToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYJRow = RAWToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToYJRow = RAWToYJRow_RVV;
+ }
+#endif
+
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else // HAS_RAWTOYJROW
+
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RAWTOYJROW
+
+ {
+#if !defined(HAS_RAWTOYJROW)
+ // Allocate 2 rows of ARGB.
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYJROW)
+ RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYJRow(src_raw, dst_y, width);
+ RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width);
+ ARGBToUVJRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RAWTOYJROW)
+ RAWToUVJRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYJRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVJRow(row, 0, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RAWTOYJROW)
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+#undef HAS_RAWTOYJROW
+
+// Convert RGB565 to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
+ void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGB565ToUVRow_C;
+ void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) =
+ RGB565ToYRow_C;
+#else
+ void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+ int width) = RGB565ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+
+// Neon version does direct RGB565 to YUV.
+#if defined(HAS_RGB565TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+ RGB565ToYRow = RGB565ToYRow_Any_NEON;
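+ // The NEON Y kernel handles any multiple of 8 pixels; the UV kernel needs
+ // a multiple of 16.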
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToYRow = RGB565ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_NEON;
+ }
+ }
+ }
+// MSA, LSX and LASX versions do direct RGB565 to YUV.
+#elif (defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_LSX) || \
+ defined(HAS_RGB565TOYROW_LASX))
+#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
+ RGB565ToYRow = RGB565ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToYRow = RGB565ToYRow_MSA;
+ RGB565ToUVRow = RGB565ToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOYROW_LSX) && defined(HAS_RGB565TOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_LSX;
+ RGB565ToYRow = RGB565ToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToYRow = RGB565ToYRow_LSX;
+ RGB565ToUVRow = RGB565ToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOYROW_LASX) && defined(HAS_RGB565TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_LASX;
+ RGB565ToYRow = RGB565ToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB565ToYRow = RGB565ToYRow_LASX;
+ RGB565ToUVRow = RGB565ToUVRow_LASX;
+ }
+ }
+#endif
+// Other platforms do intermediate conversion from RGB565 to ARGB.
+#else
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#endif
+ {
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
+ // Allocate 2 rows of ARGB.
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
+ RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+ RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + row_size, width);
+ ARGBToUVRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
+ RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
+ void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGB1555ToUVRow_C;
+ void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y,
+ int width) = ARGB1555ToYRow_C;
+#else
+ void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+ int width) = ARGB1555ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+
+// Neon version does direct ARGB1555 to YUV.
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
+ }
+ }
+ }
+// MSA, LSX and LASX versions do direct ARGB1555 to YUV.
+#elif (defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_LSX) || \
+ defined(HAS_ARGB1555TOYROW_LASX))
+#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_MSA;
+ ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOYROW_LSX) && defined(HAS_ARGB1555TOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_LSX;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_LSX;
+ ARGB1555ToUVRow = ARGB1555ToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOYROW_LASX) && defined(HAS_ARGB1555TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_LASX;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_LASX;
+ ARGB1555ToUVRow = ARGB1555ToUVRow_LASX;
+ }
+ }
+#endif
+// Other platforms do intermediate conversion from ARGB1555 to ARGB.
+#else
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#endif
+ {
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
+ // Allocate 2 rows of ARGB.
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
+ ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + row_size,
+ width);
+ ARGBToUVRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
+ ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGB4444ToUVRow_C;
+ void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y,
+ int width) = ARGB4444ToYRow_C;
+#else
+ void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+ int width) = ARGB4444ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+
+// Neon version does direct ARGB4444 to YUV.
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+ ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToYRow = ARGB4444ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
+ }
+ }
+ }
+// Other platforms do intermediate conversion from ARGB4444 to ARGB.
+#else
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ ARGBToUVRow = ARGBToUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ ARGBToUVRow = ARGBToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
+#endif
+
+ {
+#if !(defined(HAS_ARGB4444TOYROW_NEON))
+ // Allocate 2 rows of ARGB.
+ const int row_size = (width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + row_size,
+ width);
+ ARGBToUVRow(row, row_size, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_ARGB4444TOYROW_NEON))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert RGB24 to J400.
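+// J400 is a single full-range luma plane; no chroma is produced.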
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) =
+ RGB24ToYJRow_C;
+ if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGB24TOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYJRow = RGB24ToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYJRow = RGB24ToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGB24ToYJRow = RGB24ToYJRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB24ToYJRow(src_rgb24, dst_yj, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
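+
+// Hedged usage sketch (buffers and sizes are illustrative; J400 means
+// full-range JPEG luma):
+//   uint8_t rgb[64 * 48 * 3];  // packed 24 bpp input
+//   uint8_t gray[64 * 48];     // grayscale output
+//   RGB24ToJ400(rgb, 64 * 3, gray, 64, 64, 48);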
+
+// Convert RAW to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) =
+ RAWToYJRow_C;
+ if (!src_raw || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_yj = 0;
+ }
+
+#if defined(HAS_RAWTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToYJRow = RAWToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RAWToYJRow = RAWToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYJRow = RAWToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToYJRow = RAWToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToYJRow = RAWToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYJRow = RAWToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToYJRow = RAWToYJRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToYJRow(src_raw, dst_yj, width);
+ src_raw += src_stride_raw;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Android420ToI420Rotate(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_pixel_stride_uv, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height, kRotate0);
+}
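+
+// Note on src_pixel_stride_uv, based on the Android YUV_420_888 layout this
+// API mirrors: 1 means fully planar chroma (I420-style), 2 means the U and V
+// samples are interleaved (NV12/NV21-style), with the u/v pointers one byte
+// apart selecting the plane order. Hedged sketch for an interleaved source:
+//   // vu holds V,U,V,U,...; passing vu and vu + 1 reads V then U.
+//   Android420ToI420(y, y_stride, vu + 1, uv_stride, vu, uv_stride,
+//                    /*src_pixel_stride_uv=*/2, dst_y, w, dst_u, (w + 1) / 2,
+//                    dst_v, (w + 1) / 2, w, h);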
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
new file mode 100644
index 00000000..871fea59
--- /dev/null
+++ b/source/convert_argb.cc
@@ -0,0 +1,8556 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include <assert.h>
+
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle.
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h" // For ScaleRowUp2_Linear and ScaleRowUp2_Bilinear
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB with optional flipping
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4,
+ height);
+ return 0;
+}
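+
+// As with most functions in this file, a negative height requests a vertical
+// flip. Hedged example: ARGBCopy(src, stride, dst, stride, width, -height)
+// copies the image inverted.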
+
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+ (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToARGBRow = I422ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
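+
+// Note: the ABGR wrappers in this file all reuse the ARGB path. Swapping the
+// U and V arguments and selecting the VU-swapped ("Yvu") constants makes the
+// ARGB row kernels emit bytes in ABGR order, so no dedicated ABGR kernels
+// are needed.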
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u * 2 == width &&
+ src_stride_v * 2 == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+ (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToARGBRow = I422ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
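+
+// The row-coalescing test used throughout this file is a simple fast path:
+// when every stride equals the packed row width, the image is contiguous in
+// memory, so it can be processed as a single row of width * height pixels
+// (e.g. a 64x48 plane with stride 64 becomes one 3072-pixel row).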
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I444ToARGBRow = I444ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToARGBRow = I444ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I444 to RGB24 with matrix.
+LIBYUV_API
+int I444ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToRGB24Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
+ dst_stride_rgb24 == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_rgb24 = 0;
+ }
+#if defined(HAS_I444TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToRGB24Row = I444ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I444ToRGB24Row = I444ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToRGB24Row = I444ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToRGB24Row = I444ToRGB24Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToRGB24Row = I444ToRGB24Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToRGB24Row = I444ToRGB24Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I444 to RGB24.
+LIBYUV_API
+int I444ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I444ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I444 to RAW.
+LIBYUV_API
+int I444ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I444ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
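+
+// RAW is the byte-reversed sibling of RGB24 (R,G,B versus B,G,R in memory),
+// so the same swap trick used for the ABGR wrappers applies: passing V as U
+// with the Yvu constants makes the RGB24 writer emit RAW.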
+
+// Convert 10 bit YUV to AR30 with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToAR30Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToAR30Row = I210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToAR30Row = I210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToAR30Row = I210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
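+
+// AR30 layout note (per libyuv's format documentation): each pixel is a
+// 32-bit little-endian word with 10-bit B in the low bits, then 10-bit G,
+// 10-bit R, and a 2-bit alpha in the top bits.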
+
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert U010 to AR30.
+LIBYUV_API
+int U010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuI601Constants, width, height);
+}
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuH709Constants, width, height);
+}
+
+// Convert U010 to AB30.
+LIBYUV_API
+int U010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvu2020Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert 12 bit YUV to AR30 with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I212ToARGB to
+// multiply 12 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I012ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I212ToAR30Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I212TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I212ToAR30Row = I212ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I212ToAR30Row = I212ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I212TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I212ToAR30Row = I212ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I212ToAR30Row = I212ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert 10 bit 422 YUV to AR30 with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToAR30Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToAR30Row = I210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToAR30Row = I210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToAR30Row = I210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
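+
+// Unlike the 4:2:0 I010 path above, I210 is 4:2:2, so the chroma pointers
+// advance on every row rather than every other row.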
+
+// Convert I210 to AR30.
+LIBYUV_API
+int I210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H210 to AR30.
+LIBYUV_API
+int H210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert U210 to AR30.
+LIBYUV_API
+int U210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert I210 to AB30.
+LIBYUV_API
+int I210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuI601Constants, width, height);
+}
+
+// Convert H210 to AB30.
+LIBYUV_API
+int H210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuH709Constants, width, height);
+}
+
+// Convert U210 to AB30.
+LIBYUV_API
+int U210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+  return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvu2020Constants,  // Use Yvu matrix
+                          width, height);
+}
+
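+// Convert 10 bit 444 YUV to AR30 with matrix.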
+LIBYUV_API
+int I410ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToAR30Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I410TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToAR30Row = I410ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToAR30Row = I410ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToAR30Row = I410ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I410ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I210TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToARGBRow = I210ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToARGBRow = I210ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToARGBRow = I210ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H010 to ABGR.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U010 to ARGB.
+LIBYUV_API
+int U010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U010 to ABGR.
+LIBYUV_API
+int U010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert 12 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I012ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I212ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I212TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I212ToARGBRow = I212ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I212ToARGBRow = I212ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I212TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I212ToARGBRow = I212ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I212ToARGBRow = I212ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I210TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToARGBRow = I210ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToARGBRow = I210ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToARGBRow = I210ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I210 to ARGB.
+LIBYUV_API
+int I210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I210 to ABGR.
+LIBYUV_API
+int I210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H210 to ARGB.
+LIBYUV_API
+int H210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H210 to ABGR.
+LIBYUV_API
+int H210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
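+// Convert 10 bit 444 YUV to ARGB with matrix.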
+LIBYUV_API
+int I410ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToARGBRow = I410ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToARGBRow = I410ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToARGBRow = I410ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToARGBRow = I410ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I410ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
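+// Convert P010 to ARGB with matrix.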
+LIBYUV_API
+int P010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P210ToARGBRow)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_P210TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P210ToARGBRow = P210ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P210ToARGBRow = P210ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P210TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P210ToARGBRow = P210ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P210ToARGBRow = P210ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
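+
+// P010 note: each sample is 16 bits with the 10 significant bits MSB-aligned
+// (low 6 bits zero), and chroma is an interleaved UV plane at 4:2:0, which
+// is why src_uv only advances on odd rows above.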
+
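+// Convert P210 to ARGB with matrix.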
+LIBYUV_API
+int P210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P210ToARGBRow)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_P210TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P210ToARGBRow = P210ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P210ToARGBRow = P210ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P210TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P210ToARGBRow = P210ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P210ToARGBRow = P210ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+ return 0;
+}
+
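+// Convert P010 to AR30 with matrix.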
+LIBYUV_API
+int P010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P210ToAR30Row)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_P210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P210ToAR30Row = P210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P210ToAR30Row = P210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P210ToAR30Row = P210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P210ToAR30Row = P210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
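+// Convert P210 to AR30 with matrix.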
+LIBYUV_API
+int P210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P210ToAR30Row)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_P210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P210ToAR30Row = P210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P210ToAR30Row = P210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P210ToAR30Row = P210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P210ToAR30Row = P210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+ return 0;
+}
+
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
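+// A minimal usage sketch (editor illustration, not upstream libyuv code):
+// convert a tightly packed I420A frame to premultiplied ARGB with the BT.601
+// matrix. The guard macro and function name are hypothetical; the strides
+// assume contiguous planes and an even width.
+#ifdef LIBYUV_DOC_EXAMPLES
+static int ExampleI420AToPremultipliedARGB(const uint8_t* y,
+                                           const uint8_t* u,
+                                           const uint8_t* v,
+                                           const uint8_t* a,
+                                           uint8_t* argb,
+                                           int width,
+                                           int height) {
+  // Y and A are full resolution; U and V are quarter resolution (half width,
+  // half height), hence the width / 2 strides. attenuate=1 premultiplies.
+  return I420AlphaToARGBMatrix(y, width, u, width / 2, v, width / 2, a, width,
+                               argb, width * 4, &kYuvI601Constants, width,
+                               height, /*attenuate=*/1);
+}
+#endif
+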
+// Convert I422 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I422AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I444 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I444AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I444AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I420 with Alpha to ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate) {
+ return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width,
+ height, attenuate);
+}
+
+// Convert I420 with Alpha to ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate) {
+ return I420AlphaToARGBMatrix(
+ src_y, src_stride_y, src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height, attenuate);
+}
+
+// Convert I422 with Alpha to ARGB.
+LIBYUV_API
+int I422AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate) {
+ return I422AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width,
+ height, attenuate);
+}
+
+// Convert I422 with Alpha to ABGR.
+LIBYUV_API
+int I422AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate) {
+ return I422AlphaToARGBMatrix(
+ src_y, src_stride_y, src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height, attenuate);
+}
+
+// Convert I444 with Alpha to ARGB.
+LIBYUV_API
+int I444AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate) {
+ return I444AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width,
+ height, attenuate);
+}
+
+// Convert I444 with Alpha to ABGR.
+LIBYUV_API
+int I444AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate) {
+ return I444AlphaToARGBMatrix(
+ src_y, src_stride_y, src_v, src_stride_v, // Swap U and V
+ src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height, attenuate);
+}
+
+// Convert I010 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I010AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I210AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I210ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I210 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I210AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I210AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I210ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I410 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I410AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I410AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB with matrix.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I400ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
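+  // Contiguous rows can be processed as one long row, cutting per-row loop
+  // overhead.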
+ if (src_stride_y == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_I400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I400ToARGBRow = I400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I400ToARGBRow = I400ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I400ToARGBRow = I400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I400ToARGBRow = I400ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I400ToARGBRow = I400ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I400ToARGBRow = I400ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I400ToARGBRow(src_y, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB. Y is treated as limited-range luma via the BT.601
+// matrix.
+LIBYUV_API
+int I400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert J400 (full-range gray, no YUV matrix) to ARGB.
+LIBYUV_API
+int J400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) =
+ J400ToARGBRow_C;
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_J400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ J400ToARGBRow = J400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ J400ToARGBRow = J400ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J400ToARGBRow = J400ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J400ToARGBRow = J400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J400ToARGBRow = J400ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ J400ToARGBRow = J400ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ J400ToARGBRow = J400ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ J400ToARGBRow = J400ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ J400ToARGBRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+#ifndef __riscv
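+// Byte i of each mask below holds the index of the source byte copied to
+// output byte i (PSHUFB-style), as consumed by ARGBShuffle/AR64Shuffle.
+// Reversing a 4-byte pixel and swapping two channels are involutions, which
+// is why the reverse conversions reuse the same tables.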
+// Shuffle table for converting BGRA to ARGB.
+static const uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
+
+// Shuffle table for converting ABGR to ARGB.
+static const uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
+
+// Shuffle table for converting RGBA to ARGB.
+static const uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
+
+// Shuffle table for converting AR64 to AB64.
+static const uvec8 kShuffleMaskAR64ToAB64 = {
+ 4u, 5u, 2u, 3u, 0u, 1u, 6u, 7u, 12u, 13u, 10u, 11u, 8u, 9u, 14u, 15u};
+
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height);
+}
+
+// Convert ARGB to BGRA (same as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height);
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskABGRToARGB, width, height);
+}
+
+// Convert ARGB to ABGR (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskABGRToARGB, width, height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
+ (const uint8_t*)&kShuffleMaskRGBAToARGB, width, height);
+}
+
+// Convert AR64 to AB64.
+LIBYUV_API
+int AR64ToAB64(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height) {
+ return AR64Shuffle(src_ar64, src_stride_ar64, dst_ab64, dst_stride_ab64,
+ (const uint8_t*)&kShuffleMaskAR64ToAB64, width, height);
+}
+#else
+// Convert BGRA to ARGB (same as ARGBToBGRA).
+LIBYUV_API
+int BGRAToARGB(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBToBGRA(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, width,
+ height);
+}
+
+// Convert ARGB to BGRA.
+LIBYUV_API
+int ARGBToBGRA(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToBGRARow)(const uint8_t* src_argb, uint8_t* dst_bgra, int width) =
+ ARGBToBGRARow_C;
+ if (!src_argb || !dst_bgra || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_bgra == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_bgra = 0;
+ }
+
+#if defined(HAS_ARGBTOBGRAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToBGRARow = ARGBToBGRARow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToBGRARow(src_argb, dst_bgra, width);
+ src_argb += src_stride_argb;
+ dst_bgra += dst_stride_bgra;
+ }
+ return 0;
+}
+
+// Convert ARGB to ABGR.
+LIBYUV_API
+int ARGBToABGR(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToABGRRow)(const uint8_t* src_argb, uint8_t* dst_abgr, int width) =
+ ARGBToABGRRow_C;
+ if (!src_argb || !dst_abgr || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_abgr == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_abgr = 0;
+ }
+
+#if defined(HAS_ARGBTOABGRROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToABGRRow = ARGBToABGRRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToABGRRow(src_argb, dst_abgr, width);
+ src_argb += src_stride_argb;
+ dst_abgr += dst_stride_abgr;
+ }
+ return 0;
+}
+
+// Convert ABGR to ARGB (same as ARGBToABGR).
+LIBYUV_API
+int ABGRToARGB(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return ARGBToABGR(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, width,
+ height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RGBAToARGBRow)(const uint8_t* src_rgba, uint8_t* dst_argb, int width) =
+ RGBAToARGBRow_C;
+ if (!src_rgba || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ // Coalesce rows.
+ if (src_stride_rgba == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgba = dst_stride_argb = 0;
+ }
+
+#if defined(HAS_RGBATOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGBAToARGBRow = RGBAToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGBAToARGBRow(src_rgba, dst_argb, width);
+ src_rgba += src_stride_rgba;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert AR64 to AB64.
+LIBYUV_API
+int AR64ToAB64(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height) {
+ int y;
+ void (*AR64ToAB64Row)(const uint16_t* src_ar64, uint16_t* dst_ab64,
+ int width) = AR64ToAB64Row_C;
+ if (!src_ar64 || !dst_ab64 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar64 = src_ar64 + (height - 1) * src_stride_ar64;
+ src_stride_ar64 = -src_stride_ar64;
+ }
+ // Coalesce rows.
+ if (src_stride_ar64 == width * 4 && dst_stride_ab64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar64 = dst_stride_ab64 = 0;
+ }
+
+#if defined(HAS_AR64TOAB64ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ AR64ToAB64Row = AR64ToAB64Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ AR64ToAB64Row(src_ar64, dst_ab64, width);
+ src_ar64 += src_stride_ar64;
+ dst_ab64 += dst_stride_ab64;
+ }
+ return 0;
+}
+#endif
+
+// Convert RGB24 to ARGB.
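+// (In libyuv, RGB24 stores pixels as B,G,R bytes in memory; RAW, below,
+// stores R,G,B.)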
+LIBYUV_API
+int RGB24ToARGB(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_argb = 0;
+ }
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB24ToARGBRow(src_rgb24, dst_argb, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;
+ if (!src_raw || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_argb = 0;
+ }
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToARGBRow = RAWToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToARGBRow = RAWToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToARGBRow = RAWToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToARGBRow = RAWToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToARGBRow = RAWToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToARGBRow = RAWToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToARGBRow = RAWToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToARGBRow(src_raw, dst_argb, width);
+ src_raw += src_stride_raw;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RAW to RGBA.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) =
+ RAWToRGBARow_C;
+ if (!src_raw || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_rgba = 0;
+ }
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToRGBARow = RAWToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToRGBARow = RAWToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToRGBARow = RAWToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToRGBARow = RAWToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTORGBAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToRGBARow = RAWToRGBARow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToRGBARow(src_raw, dst_rgba, width);
+ src_raw += src_stride_raw;
+ dst_rgba += dst_stride_rgba;
+ }
+ return 0;
+}
+
+// Convert RGB565 to ARGB.
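+// 5- and 6-bit channels are widened to 8 bits by bit replication, e.g.
+// (b << 3) | (b >> 2).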
+LIBYUV_API
+int RGB565ToARGB(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb,
+ int width) = RGB565ToARGBRow_C;
+ if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb565 = dst_stride_argb = 0;
+ }
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB565ToARGBRow(src_rgb565, dst_argb, width);
+ src_rgb565 += src_stride_rgb565;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb,
+ int width) = ARGB1555ToARGBRow_C;
+ if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+ // Coalesce rows.
+ if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb1555 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
+ src_argb1555 += src_stride_argb1555;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb,
+ int width) = ARGB4444ToARGBRow_C;
+ if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+ // Coalesce rows.
+ if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb4444 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
+ src_argb4444 += src_stride_argb4444;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert AR30 to ARGB.
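+// AR30 packs one pixel per 32-bit little-endian word: 10-bit B, G, R from
+// low to high bits, with 2-bit A on top (mirroring ARGB byte order).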
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ if (!src_ar30 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+ src_stride_ar30 = -src_stride_ar30;
+ }
+ // Coalesce rows.
+ if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar30 = dst_stride_argb = 0;
+ }
+ for (y = 0; y < height; ++y) {
+ AR30ToARGBRow_C(src_ar30, dst_argb, width);
+ src_ar30 += src_stride_ar30;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert AR30 to ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ int y;
+ if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+ src_stride_ar30 = -src_stride_ar30;
+ }
+ // Coalesce rows.
+ if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar30 = dst_stride_abgr = 0;
+ }
+ for (y = 0; y < height; ++y) {
+ AR30ToABGRRow_C(src_ar30, dst_abgr, width);
+ src_ar30 += src_stride_ar30;
+ dst_abgr += dst_stride_abgr;
+ }
+ return 0;
+}
+
+// Convert AR30 to AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ int y;
+ if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+ src_stride_ar30 = -src_stride_ar30;
+ }
+ // Coalesce rows.
+ if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar30 = dst_stride_ab30 = 0;
+ }
+ for (y = 0; y < height; ++y) {
+ AR30ToAB30Row_C(src_ar30, dst_ab30, width);
+ src_ar30 += src_stride_ar30;
+ dst_ab30 += dst_stride_ab30;
+ }
+ return 0;
+}
+
+// Convert AR64 to ARGB.
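+// 16-bit channels are narrowed to 8 bits by taking the high byte of each
+// component.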
+LIBYUV_API
+int AR64ToARGB(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb,
+ int width) = AR64ToARGBRow_C;
+ if (!src_ar64 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar64 = src_ar64 + (height - 1) * src_stride_ar64;
+ src_stride_ar64 = -src_stride_ar64;
+ }
+ // Coalesce rows.
+ if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar64 = dst_stride_argb = 0;
+ }
+#if defined(HAS_AR64TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ AR64ToARGBRow = AR64ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_AR64TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AR64ToARGBRow = AR64ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ AR64ToARGBRow = AR64ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_AR64TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AR64ToARGBRow = AR64ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ AR64ToARGBRow = AR64ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_AR64TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ AR64ToARGBRow = AR64ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ AR64ToARGBRow(src_ar64, dst_argb, width);
+ src_ar64 += src_stride_ar64;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert AB64 to ARGB.
+LIBYUV_API
+int AB64ToARGB(const uint16_t* src_ab64,
+ int src_stride_ab64,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*AB64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb,
+ int width) = AB64ToARGBRow_C;
+ if (!src_ab64 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ab64 = src_ab64 + (height - 1) * src_stride_ab64;
+ src_stride_ab64 = -src_stride_ab64;
+ }
+ // Coalesce rows.
+ if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ab64 = dst_stride_argb = 0;
+ }
+#if defined(HAS_AB64TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ AB64ToARGBRow = AB64ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_AB64TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AB64ToARGBRow = AB64ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ AB64ToARGBRow = AB64ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_AB64TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AB64ToARGBRow = AB64ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ AB64ToARGBRow = AB64ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_AB64TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ AB64ToARGBRow = AB64ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ AB64ToARGBRow(src_ab64, dst_argb, width);
+ src_ab64 += src_stride_ab64;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToARGBRow)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ NV12ToARGBRow = NV12ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV21ToARGBRow)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ NV21ToARGBRow = NV21ToARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
+ }
+ return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
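+// A minimal usage sketch (editor illustration, not upstream libyuv code):
+// decode a tightly packed NV12 frame into ARGB. For packed NV12 with even
+// width, the interleaved UV plane has the same byte stride as the Y plane.
+// The guard macro and function name are hypothetical.
+#ifdef LIBYUV_DOC_EXAMPLES
+static int ExampleNV12ToARGB(const uint8_t* nv12_y,
+                             const uint8_t* nv12_uv,
+                             uint8_t* argb,
+                             int width,
+                             int height) {
+  return NV12ToARGB(nv12_y, width, nv12_uv, width, argb, width * 4, width,
+                    height);
+}
+#endif
+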
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
+// Convert NV12 to ABGR.
+// To output ABGR instead of ARGB, swap the U and V planes and use a mirrored
+// YUV matrix; routing NV12 data through the NV21 (VU-ordered) path performs
+// that swap.
+LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr,
+ dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to ABGR.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr,
+ dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
+// TODO(fbarchard): Consider SSSE3 2 step conversion.
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToRGB24Row)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_NV12TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV21ToRGB24Row)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_NV21TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
+ }
+ return 0;
+}
+
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+ width, height);
+}
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
+ dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+ width, height);
+}
+
+// Convert NV12 to RAW.
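+// RAW is RGB24 with the byte order reversed, so this reuses
+// NV21ToRGB24Matrix with the VU-swapped kYvuI601Constants.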
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
+ dst_stride_raw, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
+ dst_stride_raw, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to YUV24.
+LIBYUV_API
+int NV21ToYUV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_yuv24,
+ int dst_stride_yuv24,
+ int width,
+ int height) {
+ int y;
+ void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu,
+ uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C;
+ if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24;
+ dst_stride_yuv24 = -dst_stride_yuv24;
+ }
+#if defined(HAS_NV21TOYUV24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOYUV24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOYUV24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width);
+ dst_yuv24 += dst_stride_yuv24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
+ }
+ return 0;
+}
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants, int width) =
+ YUY2ToARGBRow_C;
+ if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_argb = 0;
+ }
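+ // (With strides zeroed, the loop below walks the whole contiguous image as
+ // a single long row.)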
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_LSX;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants, int width) =
+ UYVYToARGBRow_C;
+ if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_argb = 0;
+ }
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToARGBRow = UYVYToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToARGBRow = UYVYToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_LSX;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
+ src_uyvy += src_stride_uyvy;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
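+
+// Interleave one row of U and one row of V, each read with the given pixel
+// stride, into a packed UV row as used by NV12.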
+static void WeavePixels(const uint8_t* src_u,
+ const uint8_t* src_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_uv,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst_uv[0] = *src_u;
+ dst_uv[1] = *src_v;
+ dst_uv += 2;
+ src_u += src_pixel_stride_uv;
+ src_v += src_pixel_stride_uv;
+ }
+}
+
+// Convert Android420 to ARGB with matrix.
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ uint8_t* dst_uv;
+ const ptrdiff_t vu_off = src_v - src_u;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+
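+ // Android flexible YUV (e.g. YUV_420_888) exposes U and V as separate
+ // planes with a pixel stride. A pixel stride of 1 is planar I420; a pixel
+ // stride of 2 with U and V adjacent is interleaved NV21 or NV12, which
+ // dispatch to the faster biplanar converters.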
+ // I420
+ if (src_pixel_stride_uv == 1) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ yuvconstants, width, height);
+ }
+ // NV21
+ if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+ src_stride_u == src_stride_v) {
+ return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb,
+ dst_stride_argb, yuvconstants, width, height);
+ }
+ // NV12
+ if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
+ return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb,
+ dst_stride_argb, yuvconstants, width, height);
+ }
+
+ // General case fallback: weave U and V into a temporary NV12 plane, then
+ // convert that.
+ align_buffer_64(plane_uv, halfwidth * 2 * halfheight);
+ if (!plane_uv)
+ return 1;
+ dst_uv = plane_uv;
+ for (y = 0; y < halfheight; ++y) {
+ WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uv += halfwidth * 2;
+ }
+ NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb,
+ dst_stride_argb, yuvconstants, width, height);
+ free_aligned_buffer_64(plane_uv);
+ return 0;
+}
+
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_pixel_stride_uv, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width,
+ height);
+}
+
+// Convert Android420 to ABGR.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, src_pixel_stride_uv, dst_abgr,
+ dst_stride_abgr, &kYvuI601Constants, width,
+ height);
+}
+
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToRGBARow = I422ToRGBARow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGBARow = I422ToRGBARow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGBARow = I422ToRGBARow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToRGBARow = I422ToRGBARow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToRGB565Row)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_rgb565, dst_stride_rgb565, &kYuvI601Constants,
+ width, height);
+}
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToRGBARow = I422ToRGBARow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGBARow = I422ToRGBARow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGBARow = I422ToRGBARow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToRGBARow = I422ToRGBARow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB24Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToRGB24Row = I422ToRGB24Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J420 to RGB24.
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J420 to RAW.
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H420 to RGB24.
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to RAW.
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I422 to RGB24 with matrix.
+LIBYUV_API
+int I422ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB24Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToRGB24Row = I422ToRGB24Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to RGB24.
+LIBYUV_API
+int I422ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I422ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to RAW.
+LIBYUV_API
+int I422ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I422ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB1555Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+ dst_stride_argb1555 = -dst_stride_argb1555;
+ }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
+ width);
+ dst_argb1555 += dst_stride_argb1555;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB4444Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+ dst_stride_argb4444 = -dst_stride_argb4444;
+ }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
+ width);
+ dst_argb4444 += dst_stride_argb4444;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB565Row = I422ToRGB565Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert J420 to RGB565.
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert H420 to RGB565.
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert I422 to RGB565 with specified color matrix.
+LIBYUV_API
+int I422ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB565Row = I422ToRGB565Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to RGB565.
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I422ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvI601Constants, width, height);
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8_t kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
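+// Each group of four values is one row of the dither matrix; the conversion
+// loop loads row (y & 3) as a uint32_t and applies it before truncating
+// 888 to 565.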
+
+// Convert I420 to RGB565 with dithering.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;
+ void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ uint32_t dither4, int width) =
+ ARGBToRGB565DitherRow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ if (!dither4x4) {
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
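+ // This kernel needs both AVX512BW and AVX512VL, so test the combined mask.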
+ if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+ (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToARGBRow = I422ToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX;
+ }
+ }
+#endif
+ {
+ // Allocate a row of argb.
+ align_buffer_64(row_argb, width * 4);
+ if (!row_argb)
+ return 1;
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+ ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+ *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+ width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ free_aligned_buffer_64(row_argb);
+ }
+ return 0;
+}
+
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToAR30Row_C;
+
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+
+#if defined(HAS_I422TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToAR30Row = I422ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToAR30Row = I422ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToAR30Row = I422ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert I420 to AB30.
+LIBYUV_API
+int I420ToAB30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuI601Constants, width, height);
+}
+
+// Convert H420 to AB30.
+LIBYUV_API
+int H420ToAB30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuH709Constants, width, height);
+}
+
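+// Convert I420 to ARGB with matrix, upsampling chroma 2x in both directions
+// with a bilinear filter and converting each row as I444.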
+static int I420ToARGBMatrixBilinear(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToARGBRow_C;
+ void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride,
+ int dst_width) = ScaleRowUp2_Bilinear_Any_C;
+ void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I444ToARGBRow = I444ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I444ToARGBRow = I444ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToARGBRow = I444ToARGBRow_RVV;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_BILINEAR_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ // Allocate 4 temporary rows.
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 4);
+ if (!row)
+ return 1;
+ uint8_t* temp_u_1 = row;
+ uint8_t* temp_u_2 = row + row_size;
+ uint8_t* temp_v_1 = row + row_size * 2;
+ uint8_t* temp_v_2 = row + row_size * 3;
+
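+ // The first output row has no chroma row above it, so upsample it
+ // horizontally only.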
+ ScaleRowUp2_Linear(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear(src_v, temp_v_1, width);
+ I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+
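+ // Scale2RowUp_Bilinear writes two upsampled chroma rows per call (temp_*_1
+ // and temp_*_2, row_size apart), so each iteration emits two ARGB rows.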
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width);
+ Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width);
+ I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ I444ToARGBRow(src_y, temp_u_2, temp_v_2, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
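+ // For even heights the last output row has no chroma row below it, so it
+ // is also upsampled horizontally only.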
+ if (!(height & 1)) {
+ ScaleRowUp2_Linear(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear(src_v, temp_v_1, width);
+ I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
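+// Convert I422 to ARGB with matrix, upsampling chroma 2x horizontally with a
+// linear filter and converting each row as I444.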
+static int I422ToARGBMatrixLinear(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToARGBRow_C;
+ void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I444ToARGBRow = I444ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I444ToARGBRow = I444ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToARGBRow = I444ToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ // Allocate 2 temporary rows.
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+ uint8_t* temp_u = row;
+ uint8_t* temp_v = row + row_size;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear(src_u, temp_u, width);
+ ScaleRowUp2_Linear(src_v, temp_v, width);
+ I444ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
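+// The *MatrixBilinear helpers upsample I420 chroma 2x in both directions.
+// Each loop iteration hands one source chroma row to Scale2RowUp_Bilinear,
+// which writes two interpolated rows (temp_*_1 and temp_*_2, row_size
+// apart), so two output rows are converted per iteration. The first row,
+// and the last row when height is even, have no 2x2 neighborhood and use
+// the horizontal-only linear kernel instead.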
+static int I420ToRGB24MatrixBilinear(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToRGB24Row_C;
+ void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride,
+ int dst_width) = ScaleRowUp2_Bilinear_Any_C;
+ void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I444TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToRGB24Row = I444ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I444ToRGB24Row = I444ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToRGB24Row = I444ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToRGB24Row = I444ToRGB24Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I444ToRGB24Row = I444ToRGB24Row_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToRGB24Row = I444ToRGB24Row_RVV;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_BILINEAR_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ // Allocate 4 temporary rows (2 for u, 2 for v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 4);
+ uint8_t* temp_u_1 = row;
+ uint8_t* temp_u_2 = row + row_size;
+ uint8_t* temp_v_1 = row + row_size * 2;
+ uint8_t* temp_v_2 = row + row_size * 3;
+ if (!row)
+ return 1;
+
+ ScaleRowUp2_Linear(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear(src_v, temp_v_1, width);
+ I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width);
+ Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width);
+ I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ I444ToRGB24Row(src_y, temp_u_2, temp_v_2, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ if (!(height & 1)) {
+ ScaleRowUp2_Linear(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear(src_v, temp_v_1, width);
+ I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width);
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
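+// 10-bit variant of the bilinear scheme: chroma rows are upsampled with the
+// ScaleRowUp2_*_12 kernels (12 bits of headroom covers 10-bit samples) and
+// converted with the full-resolution I410ToAR30Row kernel.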
+static int I010ToAR30MatrixBilinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToAR30Row_C;
+ void (*Scale2RowUp_Bilinear_12)(
+ const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr,
+ ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C;
+ void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I410TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToAR30Row = I410ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToAR30Row = I410ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToAR30Row = I410ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ // Allocate 4 temporary 16-bit rows (2 for u, 2 for v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 4 * sizeof(uint16_t));
+ uint16_t* temp_u_1 = (uint16_t*)(row);
+ uint16_t* temp_u_2 = (uint16_t*)(row) + row_size;
+ uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2;
+ uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3;
+ if (!row)
+ return 1;
+
+ ScaleRowUp2_Linear_12(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v_1, width);
+ I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width);
+ Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width);
+ I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ I410ToAR30Row(src_y, temp_u_2, temp_v_2, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ if (!(height & 1)) {
+ ScaleRowUp2_Linear_12(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v_1, width);
+ I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width);
+ }
+
+ free_aligned_buffer_64(row);
+
+ return 0;
+}
+
+static int I210ToAR30MatrixLinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToAR30Row_C;
+ void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I410TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToAR30Row = I410ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToAR30Row = I410ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToAR30Row = I410ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ // Allocate 2 temporary 16-bit rows (u and v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 2 * sizeof(uint16_t));
+ uint16_t* temp_u = (uint16_t*)(row);
+ uint16_t* temp_v = (uint16_t*)(row) + row_size;
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear_12(src_u, temp_u, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v, width);
+ I410ToAR30Row(src_y, temp_u, temp_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+static int I010ToARGBMatrixBilinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToARGBRow_C;
+ void (*Scale2RowUp_Bilinear_12)(
+ const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr,
+ ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C;
+ void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToARGBRow = I410ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToARGBRow = I410ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToARGBRow = I410ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToARGBRow = I410ToARGBRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ // Allocate 4 temporary 16-bit rows (2 for u, 2 for v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 4 * sizeof(uint16_t));
+ uint16_t* temp_u_1 = (uint16_t*)(row);
+ uint16_t* temp_u_2 = (uint16_t*)(row) + row_size;
+ uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2;
+ uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3;
+ if (!row)
+ return 1;
+
+ ScaleRowUp2_Linear_12(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v_1, width);
+ I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width);
+ Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width);
+ I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ I410ToARGBRow(src_y, temp_u_2, temp_v_2, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ if (!(height & 1)) {
+ ScaleRowUp2_Linear_12(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v_1, width);
+ I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+static int I210ToARGBMatrixLinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToARGBRow_C;
+ void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToARGBRow = I410ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToARGBRow = I410ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToARGBRow = I410ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToARGBRow = I410ToARGBRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ // Allocate 2 temporary 16-bit rows (u and v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 2 * sizeof(uint16_t));
+ uint16_t* temp_u = (uint16_t*)(row);
+ uint16_t* temp_v = (uint16_t*)(row) + row_size;
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear_12(src_u, temp_u, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v, width);
+ I410ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
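+// Alpha variants reuse the same upsampling scheme. The alpha plane is
+// already full resolution, so src_a feeds the row kernel directly and
+// advances on every output row; when attenuate is set, each finished row is
+// premultiplied in place with ARGBAttenuateRow.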
+static int I420AlphaToARGBMatrixBilinear(
+ const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I444AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride,
+ int dst_width) = ScaleRowUp2_Bilinear_Any_C;
+ void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_BILINEAR_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_RVV;
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ // Allocate 4 temporary rows (2 for u, 2 for v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 4);
+ uint8_t* temp_u_1 = row;
+ uint8_t* temp_u_2 = row + row_size;
+ uint8_t* temp_v_1 = row + row_size * 2;
+ uint8_t* temp_v_2 = row + row_size * 3;
+ if (!row)
+ return 1;
+
+ ScaleRowUp2_Linear(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear(src_v, temp_v_1, width);
+ I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_a += src_stride_a;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width);
+ Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width);
+ I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_a += src_stride_a;
+ I444AlphaToARGBRow(src_y, temp_u_2, temp_v_2, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ src_a += src_stride_a;
+ }
+
+ if (!(height & 1)) {
+ ScaleRowUp2_Linear(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear(src_v, temp_v_1, width);
+ I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I444AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_RVV;
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ // Allocate 2 temporary rows (u and v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ uint8_t* temp_u = row;
+ uint8_t* temp_v = row + row_size;
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear(src_u, temp_u, width);
+ ScaleRowUp2_Linear(src_v, temp_v, width);
+ I444AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
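+// 10-bit alpha variant: the 16-bit alpha plane (10-bit samples) is consumed
+// by I410AlphaToARGBRow directly, while attenuation still operates on the
+// 8-bit ARGB output row.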
+static int I010AlphaToARGBMatrixBilinear(
+ const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I410AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ void (*Scale2RowUp_Bilinear_12)(
+ const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr,
+ ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C;
+ void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON;
+ ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ // Allocate 4 temporary 16-bit rows (2 for u, 2 for v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 4 * sizeof(uint16_t));
+ uint16_t* temp_u_1 = (uint16_t*)(row);
+ uint16_t* temp_u_2 = (uint16_t*)(row) + row_size;
+ uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2;
+ uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3;
+ if (!row)
+ return 1;
+
+ ScaleRowUp2_Linear_12(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v_1, width);
+ I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_a += src_stride_a;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width);
+ Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width);
+ I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_a += src_stride_a;
+ I410AlphaToARGBRow(src_y, temp_u_2, temp_v_2, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_a += src_stride_a;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ if (!(height & 1)) {
+ ScaleRowUp2_Linear_12(src_u, temp_u_1, width);
+ ScaleRowUp2_Linear_12(src_v, temp_v_1, width);
+ I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I410AlphaToARGBRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ void (*ScaleRowUp2_Linear)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
+
+#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ // Allocate 2 temporary 16-bit rows (u and v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 2 * sizeof(uint16_t));
+ uint16_t* temp_u = (uint16_t*)(row);
+ uint16_t* temp_v = (uint16_t*)(row) + row_size;
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear(src_u, temp_u, width);
+ ScaleRowUp2_Linear(src_v, temp_v, width);
+ I410AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
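+// P010/P210 carry interleaved (biplanar) UV, so each temp row holds
+// 2 * width samples and the ScaleUVRowUp2_* kernels interpolate U/V pairs
+// together before P410ToARGBRow consumes the packed row.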
+static int P010ToARGBMatrixBilinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P410ToARGBRow)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C;
+ void (*Scale2RowUp_Bilinear_16)(
+ const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr,
+ ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_P410TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P410ToARGBRow = P410ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P410ToARGBRow = P410ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P410TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P410ToARGBRow = P410ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P410ToARGBRow = P410ToARGBRow_AVX2;
+ }
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_NEON;
+ }
+#endif
+
+ // Allocate 2 temporary interleaved UV rows.
+ const int row_size = (2 * width + 31) & ~31;
+ align_buffer_64(row, row_size * 2 * sizeof(uint16_t));
+ uint16_t* temp_uv_1 = (uint16_t*)(row);
+ uint16_t* temp_uv_2 = (uint16_t*)(row) + row_size;
+ if (!row)
+ return 1;
+
+ Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width);
+ P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear_16(src_uv, src_stride_uv, temp_uv_1, row_size, width);
+ P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ P410ToARGBRow(src_y, temp_uv_2, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+
+ if (!(height & 1)) {
+ Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width);
+ P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width);
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+static int P210ToARGBMatrixLinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P410ToARGBRow)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C;
+ void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv,
+ int dst_width) = ScaleUVRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_P410TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P410ToARGBRow = P410ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P410ToARGBRow = P410ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P410TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P410ToARGBRow = P410ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P410ToARGBRow = P410ToARGBRow_AVX2;
+ }
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_NEON;
+ }
+#endif
+
+ const int row_size = (2 * width + 31) & ~31;
+ align_buffer_64(row, row_size * sizeof(uint16_t));
+ uint16_t* temp_uv = (uint16_t*)(row);
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear(src_uv, temp_uv, width);
+ P410ToARGBRow(src_y, temp_uv, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
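+// Same biplanar scheme as P010ToARGBMatrixBilinear, emitting 2:10:10:10
+// AR30 pixels via P410ToAR30Row.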
+static int P010ToAR30MatrixBilinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P410ToAR30Row)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C;
+ void (*Scale2RowUp_Bilinear_16)(
+ const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr,
+ ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_P410TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P410ToAR30Row = P410ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P410ToAR30Row = P410ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P410TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P410ToAR30Row = P410ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P410ToAR30Row = P410ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_NEON;
+ }
+#endif
+
+ // Allocate 2 temporary interleaved UV rows.
+ const int row_size = (2 * width + 31) & ~31;
+ align_buffer_64(row, row_size * 2 * sizeof(uint16_t));
+ uint16_t* temp_uv_1 = (uint16_t*)(row);
+ uint16_t* temp_uv_2 = (uint16_t*)(row) + row_size;
+ if (!row)
+ return 1;
+
+ Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width);
+ P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+
+ for (y = 0; y < height - 2; y += 2) {
+ Scale2RowUp_Bilinear_16(src_uv, src_stride_uv, temp_uv_1, row_size, width);
+ P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ P410ToAR30Row(src_y, temp_uv_2, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+
+ if (!(height & 1)) {
+ Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width);
+ P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width);
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+static int P210ToAR30MatrixLinear(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P410ToAR30Row)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C;
+ void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv,
+ int dst_width) = ScaleUVRowUp2_Linear_16_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_P410TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P410ToAR30Row = P410ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P410ToAR30Row = P410ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P410TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P410ToAR30Row = P410ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P410ToAR30Row = P410ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_NEON;
+ }
+#endif
+
+ const int row_size = (2 * width + 31) & ~31;
+ align_buffer_64(row, row_size * sizeof(uint16_t));
+ uint16_t* temp_uv = (uint16_t*)(row);
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear(src_uv, temp_uv, width);
+ P410ToAR30Row(src_y, temp_uv, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
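+// Row-by-row linear scheme as in I422ToARGBMatrixLinear, emitting RGB24;
+// this is the worker behind I422ToRGB24MatrixFilter below.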
+static int I422ToRGB24MatrixLinear(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToRGB24Row_C;
+ void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_Any_C;
+ assert(yuvconstants);
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I444TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToRGB24Row = I444ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I444ToRGB24Row = I444ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToRGB24Row = I444ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToRGB24Row = I444ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I444ToRGB24Row = I444ToRGB24Row_RVV;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowUp2_Linear = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ // Allocate 2 temporary rows (u and v).
+ const int row_size = (width + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ uint8_t* temp_u = row;
+ uint8_t* temp_v = row + row_size;
+ if (!row)
+ return 1;
+
+ for (y = 0; y < height; ++y) {
+ ScaleRowUp2_Linear(src_u, temp_u, width);
+ ScaleRowUp2_Linear(src_v, temp_v, width);
+ I444ToRGB24Row(src_y, temp_u, temp_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+LIBYUV_API
+int I422ToRGB24MatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I422ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ yuvconstants, width, height);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return I422ToRGB24MatrixLinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_rgb24, dst_stride_rgb24, yuvconstants, width, height);
+ }
+
+ return -1;
+}
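+
+// A minimal caller sketch for the filtered API (illustration only, not part
+// of this change). The buffer names and the choice of kYuvI601Constants are
+// assumptions; any YuvConstants matrix may be passed.
+//
+//   // Convert one I422 frame to RGB24 with linear chroma upsampling.
+//   int r = I422ToRGB24MatrixFilter(y, y_stride, u, u_stride, v, v_stride,
+//                                   rgb24, rgb24_stride, &kYuvI601Constants,
+//                                   width, height, kFilterLinear);
+//   // r is 0 on success, -1 on bad arguments, 1 if the temporary rows
+//   // could not be allocated.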
+
+LIBYUV_API
+int I420ToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ yuvconstants, width, height);
+ case kFilterBilinear:
+ case kFilterBox:
+ return I420ToARGBMatrixBilinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_argb, dst_stride_argb, yuvconstants, width, height);
+ case kFilterLinear:
+ // Linear filtering is possible here, but no caller currently needs it.
+ return -1;
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I422ToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ yuvconstants, width, height);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return I422ToARGBMatrixLinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_argb, dst_stride_argb, yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I420ToRGB24MatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ yuvconstants, width, height);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return I420ToRGB24MatrixBilinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_rgb24, dst_stride_rgb24, yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I010ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ yuvconstants, width, height);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return I010ToAR30MatrixBilinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_ar30, dst_stride_ar30, yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I210ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ yuvconstants, width, height);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return I210ToAR30MatrixLinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_ar30, dst_stride_ar30, yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I010ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ yuvconstants, width, height);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return I010ToARGBMatrixBilinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_argb, dst_stride_argb, yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I210ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ yuvconstants, width, height);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return I210ToARGBMatrixLinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_argb, dst_stride_argb, yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I420AlphaToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+ src_v, src_stride_v, src_a, src_stride_a,
+ dst_argb, dst_stride_argb, yuvconstants,
+ width, height, attenuate);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return I420AlphaToARGBMatrixBilinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
+ src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height,
+ attenuate);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I422AlphaToARGBMatrixFilter(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I422AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+ src_v, src_stride_v, src_a, src_stride_a,
+ dst_argb, dst_stride_argb, yuvconstants,
+ width, height, attenuate);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return I422AlphaToARGBMatrixLinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
+ src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height,
+ attenuate);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I010AlphaToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I010AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+ src_v, src_stride_v, src_a, src_stride_a,
+ dst_argb, dst_stride_argb, yuvconstants,
+ width, height, attenuate);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return I010AlphaToARGBMatrixBilinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
+ src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height,
+ attenuate);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int I210AlphaToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return I210AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+ src_v, src_stride_v, src_a, src_stride_a,
+ dst_argb, dst_stride_argb, yuvconstants,
+ width, height, attenuate);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return I210AlphaToARGBMatrixLinear(
+ src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
+ src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height,
+ attenuate);
+ }
+
+ return -1;
+}
+
+// TODO(fb): Verify this function works correctly. P010 is like NV12 but
+// 10 bit, with an interleaved biplanar UV plane.
+LIBYUV_API
+int P010ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return P010ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_argb, dst_stride_argb, yuvconstants, width,
+ height);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return P010ToARGBMatrixBilinear(src_y, src_stride_y, src_uv,
+ src_stride_uv, dst_argb, dst_stride_argb,
+ yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int P210ToARGBMatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return P210ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_argb, dst_stride_argb, yuvconstants, width,
+ height);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return P210ToARGBMatrixLinear(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_argb, dst_stride_argb, yuvconstants,
+ width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int P010ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return P010ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_ar30, dst_stride_ar30, yuvconstants, width,
+ height);
+ case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0
+ case kFilterBilinear:
+ case kFilterBox:
+ return P010ToAR30MatrixBilinear(src_y, src_stride_y, src_uv,
+ src_stride_uv, dst_ar30, dst_stride_ar30,
+ yuvconstants, width, height);
+ }
+
+ return -1;
+}
+
+LIBYUV_API
+int P210ToAR30MatrixFilter(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ enum FilterMode filter) {
+ switch (filter) {
+ case kFilterNone:
+ return P210ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_ar30, dst_stride_ar30, yuvconstants, width,
+ height);
+ case kFilterBilinear:
+ case kFilterBox:
+ case kFilterLinear:
+ return P210ToAR30MatrixLinear(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_ar30, dst_stride_ar30, yuvconstants,
+ width, height);
+ }
+
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/convert_from.cc b/source/convert_from.cc
new file mode 100644
index 00000000..e69da9e9
--- /dev/null
+++ b/source/convert_from.cc
@@ -0,0 +1,910 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h" // For I420Copy
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
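+// Subsample a dimension: rounds the magnitude up (a == (1 << s) - 1) and is
+// symmetric for negative (mirrored) sizes, e.g. SUBSAMPLE(5, 1, 1) == 3 and
+// SUBSAMPLE(-5, 1, 1) == -3.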
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// I420 to any I4xx YUV format with mirroring.
+// TODO(fbarchard): Consider kFilterNone for Y, or CopyPlane
+
+static int I420ToI4xx(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int src_y_width,
+ int src_y_height,
+ int dst_uv_width,
+ int dst_uv_height) {
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
+ const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+ int r;
+ if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 ||
+ dst_uv_height <= 0) {
+ return -1;
+ }
+ if (dst_y) {
+ r = ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+ dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+ dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+ dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
+ return r;
+}
+
+// Convert 8 bit YUV to 10 bit.
+LIBYUV_API
+int I420ToI010(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
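+ // Start at the last row and step upward via negated strides.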
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
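+ // Scale 1024 applies a 4x gain, so 8-bit 255 maps to 1020 near the top of
+ // the 10-bit range.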
+ // Convert Y plane.
+ Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,
+ height);
+ // Convert UV planes.
+ Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
+ halfheight);
+ Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
+ halfheight);
+ return 0;
+}
+
+// Convert 8 bit YUV to 12 bit.
+LIBYUV_API
+int I420ToI012(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
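+ // Scale 4096 applies a 16x gain, so 8-bit 255 maps to 4080 in the 12-bit
+ // range.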
+ // Convert Y plane.
+ Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 4096, width,
+ height);
+ // Convert UV planes.
+ Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 4096, halfwidth,
+ halfheight);
+ Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 4096, halfwidth,
+ halfheight);
+ return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 422 chroma is 1/2 width, 1x height
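+// Converting 420 to 422 keeps the halved chroma width and doubles the number
+// of chroma rows.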
+LIBYUV_API
+int I420ToI422(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ const int dst_uv_width = (Abs(width) + 1) >> 1;
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, dst_uv_width,
+ dst_uv_height);
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 444 chroma is 1x width, 1x height
+LIBYUV_API
+int I420ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ const int dst_uv_width = Abs(width);
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, dst_uv_width,
+ dst_uv_height);
+}
+
+// 420 chroma to 444 chroma, 10/12 bit version
+LIBYUV_API
+int I010ToI410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int r;
+ if (width == 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width),
+ Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width),
+ Abs(height), kFilterBilinear);
+ return r;
+}
+
+// 422 chroma to 444 chroma, 10/12 bit version
+LIBYUV_API
+int I210ToI410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int r;
+ if (width == 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
+ dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
+ dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
+ return r;
+}
+
+// 422 chroma is 1/2 width, 1x height
+// 444 chroma is 1x width, 1x height
+LIBYUV_API
+int I422ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int r;
+ if (width == 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ r = ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ }
+ r = ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
+ dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
+ dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
+ return r;
+}
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12, or NV21.
+LIBYUV_API
+int I400Copy(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+LIBYUV_API
+int I422ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+ if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+ // Coalesce rows.
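+ // When every plane is stored contiguously the image can be processed as a
+ // single long row with zero strides.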
+ if (src_stride_y == width && src_stride_u * 2 == width &&
+ src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+ if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_LASX;
+ }
+ }
+#endif
+
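+ // With 420 sampling each U/V row covers two Y rows, so two YUY2 rows are
+ // written per pass before the chroma pointers advance.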
+ for (y = 0; y < height - 1; y += 2) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
+ dst_yuy2 + dst_stride_yuy2, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2 * 2;
+ }
+ if (height & 1) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I422ToUYVY(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+ if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u * 2 == width &&
+ src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToUYVY(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+ if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+ dst_uyvy + dst_stride_uyvy, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy * 2;
+ }
+ if (height & 1) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) / 2;
+ int halfheight = (height + 1) / 2;
+ if (!src_y || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
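+ // Interleave the half-resolution U and V planes into the biplanar UV plane.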
+ MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
+ halfwidth, halfheight);
+ return 0;
+}
+
+LIBYUV_API
+int I420ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
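+ // Swapping the U and V arguments makes I420ToNV12 write the merged chroma
+ // plane in VU order, which is NV21.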
+ return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+ width, height);
+}
+
+// Convert I420 to the format specified by fourcc.
+LIBYUV_API
+int ConvertFromI420(const uint8_t* y,
+ int y_stride,
+ const uint8_t* u,
+ int u_stride,
+ const uint8_t* v,
+ int v_stride,
+ uint8_t* dst_sample,
+ int dst_sample_stride,
+ int width,
+ int height,
+ uint32_t fourcc) {
+ uint32_t format = CanonicalFourCC(fourcc);
+ int r = 0;
+ if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
+ return -1;
+ }
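+ // A dst_sample_stride of 0 selects the minimal packed stride for the
+ // requested format.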
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
+ break;
+ case FOURCC_UYVY:
+ r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
+ break;
+ case FOURCC_RGBP:
+ r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
+ break;
+ case FOURCC_RGBO:
+ r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_R444:
+ r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_24BG:
+ r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3, width,
+ height);
+ break;
+ case FOURCC_RAW:
+ r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3, width,
+ height);
+ break;
+ case FOURCC_ARGB:
+ r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_BGRA:
+ r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_ABGR:
+ r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_RGBA:
+ r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_AR30:
+ r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_I400:
+ r = I400Copy(y, y_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
+ break;
+ case FOURCC_NV12: {
+ int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
+ uint8_t* dst_uv = dst_sample + dst_y_stride * height;
+ r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, dst_uv,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
+ break;
+ }
+ case FOURCC_NV21: {
+ int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
+ uint8_t* dst_vu = dst_sample + dst_y_stride * height;
+ r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, dst_vu,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
+ break;
+ }
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YV12: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ int halfstride = (dst_sample_stride + 1) / 2;
+ int halfheight = (height + 1) / 2;
+ uint8_t* dst_u;
+ uint8_t* dst_v;
+ if (format == FOURCC_YV12) {
+ dst_v = dst_sample + dst_sample_stride * height;
+ dst_u = dst_v + halfstride * halfheight;
+ } else {
+ dst_u = dst_sample + dst_sample_stride * height;
+ dst_v = dst_u + halfstride * halfheight;
+ }
+ r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
+ width, height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ int halfstride = (dst_sample_stride + 1) / 2;
+ uint8_t* dst_u;
+ uint8_t* dst_v;
+ if (format == FOURCC_YV16) {
+ dst_v = dst_sample + dst_sample_stride * height;
+ dst_u = dst_v + halfstride * height;
+ } else {
+ dst_u = dst_sample + dst_sample_stride * height;
+ dst_v = dst_u + halfstride * height;
+ }
+ r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
+ width, height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ uint8_t* dst_u;
+ uint8_t* dst_v;
+ if (format == FOURCC_YV24) {
+ dst_v = dst_sample + dst_sample_stride * height;
+ dst_u = dst_v + dst_sample_stride * height;
+ } else {
+ dst_u = dst_sample + dst_sample_stride * height;
+ dst_v = dst_u + dst_sample_stride * height;
+ }
+ r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, dst_sample_stride, dst_v,
+ dst_sample_stride, width, height);
+ break;
+ }
+ // Formats not supported: MJPG, other biplanar formats, and some RGB formats.
+ default:
+ return -1; // unknown fourcc - return failure code.
+ }
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/convert_from_argb.cc b/source/convert_from_argb.cc
index fbcd039d..b45de8c8 100644
--- a/files/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -76,11 +76,19 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOUV444ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUV444Row = ARGBToUV444Row_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToUV444Row = ARGBToUV444Row_MMI;
+#if defined(HAS_ARGBTOUV444ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUV444Row = ARGBToUV444Row_LASX;
}
}
#endif
@@ -103,7 +111,7 @@ int ARGBToI444(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -116,14 +124,27 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_RVV)
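+ // RVV row functions stripmine to any width; no alignment check is needed.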
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToUV444Row(src_argb, dst_u, dst_v, width);
@@ -170,30 +191,42 @@ int ARGBToI422(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -206,40 +239,51 @@ int ARGBToI422(const uint8_t* src_argb,
}
}
#endif
-
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
+#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ ARGBToUVRow = ARGBToUVRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+ ARGBToYRow = ARGBToYRow_LSX;
+ ARGBToUVRow = ARGBToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
@@ -279,74 +323,89 @@ int ARGBToNV12(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -358,11 +417,19 @@ int ARGBToNV12(const uint8_t* src_argb,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
+ if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_AVX2;
}
}
#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -379,18 +446,25 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow_ = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- MergeUVRow_ = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
}
}
#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
+#endif
{
// Allocate a row of U and a row of V.
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
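+ // Bail out if the aligned row allocation failed.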
+ if (!row_u)
+ return 1;
for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@@ -439,30 +513,42 @@ int ARGBToNV21(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -475,39 +561,51 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
+#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ ARGBToUVRow = ARGBToUVRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+ ARGBToYRow = ARGBToYRow_LSX;
+ ARGBToUVRow = ARGBToUVRow_LSX;
}
}
#endif
-
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -519,11 +617,19 @@ int ARGBToNV21(const uint8_t* src_argb,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
+ if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_AVX2;
}
}
#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(halfwidth, 64)) {
+ MergeUVRow_ = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -540,18 +646,25 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow_ = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- MergeUVRow_ = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
}
}
#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
+#endif
{
// Allocate a row of U and a row of V.
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+ if (!row_u)
+ return 1;
for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@@ -599,30 +712,42 @@ int ABGRToNV12(const uint8_t* src_abgr,
src_abgr = src_abgr + (height - 1) * src_stride_abgr;
src_stride_abgr = -src_stride_abgr;
}
-#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+#if defined(HAS_ABGRTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_SSSE3;
ABGRToYRow = ABGRToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ABGRToUVRow = ABGRToUVRow_Any_AVX2;
ABGRToYRow = ABGRToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ABGRToUVRow = ABGRToUVRow_AVX2;
ABGRToYRow = ABGRToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ABGRToYRow = ABGRToYRow_NEON;
}
}
@@ -635,38 +760,227 @@ int ABGRToNV12(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MSA)
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ABGRToYRow = ABGRToYRow_MSA;
}
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYRow = ABGRToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ABGRToYRow = ABGRToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYRow = ABGRToYRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ABGRToYRow = ABGRToYRow_RVV;
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(halfwidth, 64)) {
+ MergeUVRow_ = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
+#endif
+ {
+ // Allocate a row of U and a row of V.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+ if (!row_u)
+ return 1;
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ABGRToNV21(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+ ABGRToYRow_C;
+ void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_vu, int width) = MergeUVRow_C;
+ if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
}
#endif
-#if defined(HAS_ABGRTOUVROW_MSA)
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ }
if (IS_ALIGNED(width, 32)) {
ABGRToUVRow = ABGRToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToYRow = ABGRToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ABGRToYRow = ABGRToYRow_MMI;
+#if defined(HAS_ABGRTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYRow = ABGRToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_LSX;
}
}
#endif
-#if defined(HAS_ABGRTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToUVRow = ABGRToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_MMI;
+#if defined(HAS_ABGRTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ABGRToYRow = ABGRToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYRow = ABGRToYRow_LASX;
}
}
#endif
+#if defined(HAS_ABGRTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ABGRToYRow = ABGRToYRow_RVV;
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -678,11 +992,19 @@ int ABGRToNV12(const uint8_t* src_abgr,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
+ if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_AVX2;
}
}
#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(halfwidth, 64)) {
+ MergeUVRow_ = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -699,31 +1021,38 @@ int ABGRToNV12(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow_ = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- MergeUVRow_ = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
}
}
#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
+#endif
{
// Allocate a row of U and a row of V.
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+ if (!row_u)
+ return 1;
for (y = 0; y < height - 1; y += 2) {
ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
- MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
ABGRToYRow(src_abgr, dst_y, width);
ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
src_abgr += src_stride_abgr * 2;
dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
+ dst_vu += dst_stride_vu;
}
if (height & 1) {
ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
- MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
ABGRToYRow(src_abgr, dst_y, width);
}
free_aligned_buffer_64(row_u);
@@ -764,30 +1093,42 @@ int ARGBToYUY2(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_yuy2 = 0;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -800,38 +1141,51 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
+#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ ARGBToUVRow = ARGBToUVRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+ ARGBToYRow = ARGBToYRow_LSX;
+ ARGBToUVRow = ARGBToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -864,11 +1218,19 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_I422TOYUY2ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToYUY2Row = I422ToYUY2Row_MMI;
+#if defined(HAS_I422TOYUY2ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_LASX;
}
}
#endif
@@ -878,6 +1240,8 @@ int ARGBToYUY2(const uint8_t* src_argb,
align_buffer_64(row_y, ((width + 63) & ~63) * 2);
uint8_t* row_u = row_y + ((width + 63) & ~63);
uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
+ if (!row_y)
+ return 1;
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, row_u, row_v, width);
@@ -925,30 +1289,42 @@ int ARGBToUYVY(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_uyvy = 0;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -961,38 +1337,51 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
+#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ ARGBToUVRow = ARGBToUVRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+ ARGBToYRow = ARGBToYRow_LSX;
+ ARGBToUVRow = ARGBToUVRow_LSX;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -1025,11 +1414,19 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_I422TOUYVYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToUYVYRow = I422ToUYVYRow_MMI;
+#if defined(HAS_I422TOUYVYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_LASX;
}
}
#endif
@@ -1039,6 +1436,8 @@ int ARGBToUYVY(const uint8_t* src_argb,
align_buffer_64(row_y, ((width + 63) & ~63) * 2);
uint8_t* row_u = row_y + ((width + 63) & ~63);
uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
+ if (!row_y)
+ return 1;
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, row_u, row_v, width);
@@ -1097,7 +1496,7 @@ int ARGBToI400(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -1110,14 +1509,27 @@ int ARGBToI400(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
}
}
#endif
+#if defined(HAS_ARGBTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToYRow(src_argb, dst_y, width);
@@ -1127,6 +1539,7 @@ int ARGBToI400(const uint8_t* src_argb,
return 0;
}
+#ifndef __riscv
// Shuffle table for converting ARGB to RGBA.
static const uvec8 kShuffleMaskARGBToRGBA = {
3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
@@ -1142,6 +1555,47 @@ int ARGBToRGBA(const uint8_t* src_argb,
return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
(const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height);
}
+#else
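+// RISC-V builds implement ARGBToRGBA with a dedicated row function (RVV
+// when available) instead of the byte-shuffle table above.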
+// Convert ARGB to RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToRGBARow)(const uint8_t* src_argb, uint8_t* dst_rgba, int width) =
+ ARGBToRGBARow_C;
+ if (!src_argb || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_rgba == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgba = 0;
+ }
+
+#if defined(HAS_ARGBTORGBAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToRGBARow = ARGBToRGBARow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRGBARow(src_argb, dst_rgba, width);
+ src_argb += src_stride_argb;
+ dst_rgba += dst_stride_rgba;
+ }
+ return 0;
+}
+#endif
// Convert ARGB to RGB24.
LIBYUV_API
@@ -1195,7 +1649,7 @@ int ARGBToRGB24(const uint8_t* src_argb,
#if defined(HAS_ARGBTORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_NEON;
}
}
@@ -1208,14 +1662,27 @@ int ARGBToRGB24(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORGB24ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB24Row = ARGBToRGB24Row_MMI;
+#if defined(HAS_ARGBTORGB24ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_LSX;
}
}
#endif
+#if defined(HAS_ARGBTORGB24ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB24Row(src_argb, dst_rgb24, width);
@@ -1282,14 +1749,27 @@ int ARGBToRAW(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORAWROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRAWRow = ARGBToRAWRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRAWRow = ARGBToRAWRow_MMI;
+#if defined(HAS_ARGBTORAWROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_LSX;
}
}
#endif
+#if defined(HAS_ARGBTORAWROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRAWRow = ARGBToRAWRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORAWROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToRAWRow = ARGBToRAWRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRAWRow(src_argb, dst_raw, width);
@@ -1315,7 +1795,7 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
int height) {
int y;
void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
- const uint32_t dither4, int width) =
+ uint32_t dither4, int width) =
ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
@@ -1360,11 +1840,19 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+#if defined(HAS_ARGBTORGB565DITHERROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX;
}
}
#endif
@@ -1437,11 +1925,20 @@ int ARGBToRGB565(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORGB565ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565Row = ARGBToRGB565Row_MMI;
+#if defined(HAS_ARGBTORGB565ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_LSX;
+ }
+ }
+#endif
+
+#if defined(HAS_ARGBTORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_LASX;
}
}
#endif
@@ -1511,11 +2008,19 @@ int ARGBToARGB1555(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOARGB1555ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_MMI;
+#if defined(HAS_ARGBTOARGB1555ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_LASX;
}
}
#endif
@@ -1585,11 +2090,19 @@ int ARGBToARGB4444(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOARGB4444ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_MMI;
+#if defined(HAS_ARGBTOARGB4444ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_LASX;
}
}
#endif
@@ -1706,19 +2219,19 @@ int ARGBToJ420(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_yj,
int dst_stride_yj,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
int width,
int height) {
int y;
void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
+ uint8_t* dst_uj, uint8_t* dst_vj, int width) =
ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
ARGBToYJRow_C;
- if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
+ if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1727,16 +2240,38 @@ int ARGBToJ420(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
@@ -1745,66 +2280,63 @@ int ARGBToJ420(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_NEON;
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_MSA;
}
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_MSA;
+ }
}
#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYJRow = ARGBToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_MMI;
+#if defined(HAS_ARGBTOYJROW_LSX) && defined(HAS_ARGBTOUVJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LSX;
+ ARGBToUVJRow = ARGBToUVJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_LSX;
+ ARGBToUVJRow = ARGBToUVJRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+#if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LASX;
+ ARGBToUVJRow = ARGBToUVJRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVJRow = ARGBToUVJRow_MSA;
+ ARGBToYJRow = ARGBToYJRow_LASX;
+ ARGBToUVJRow = ARGBToUVJRow_LASX;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_MMI;
- }
+#if defined(HAS_ARGBTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYJRow = ARGBToYJRow_RVV;
}
#endif
for (y = 0; y < height - 1; y += 2) {
- ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToUVJRow(src_argb, src_stride_argb, dst_uj, dst_vj, width);
ARGBToYJRow(src_argb, dst_yj, width);
ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
src_argb += src_stride_argb * 2;
dst_yj += dst_stride_yj * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
+ dst_uj += dst_stride_uj;
+ dst_vj += dst_stride_vj;
}
if (height & 1) {
- ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width);
ARGBToYJRow(src_argb, dst_yj, width);
}
return 0;
@@ -1816,19 +2348,19 @@ int ARGBToJ422(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_yj,
int dst_stride_yj,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
int width,
int height) {
int y;
void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
- uint8_t* dst_u, uint8_t* dst_v, int width) =
+ uint8_t* dst_uj, uint8_t* dst_vj, int width) =
ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
ARGBToYJRow_C;
- if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
+ if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1839,21 +2371,27 @@ int ARGBToJ422(const uint8_t* src_argb,
}
// Coalesce rows.
if (src_stride_argb == width * 4 && dst_stride_yj == width &&
- dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
+ dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) {
width *= height;
height = 1;
- src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
+ src_stride_argb = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0;
}
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
@@ -1862,10 +2400,18 @@ int ARGBToJ422(const uint8_t* src_argb,
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
@@ -1878,46 +2424,51 @@ int ARGBToJ422(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_MSA;
}
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_MSA;
+ }
}
#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYJRow = ARGBToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_MMI;
+#if defined(HAS_ARGBTOYJROW_LSX) && defined(HAS_ARGBTOUVJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LSX;
+ ARGBToUVJRow = ARGBToUVJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_LSX;
+ ARGBToUVJRow = ARGBToUVJRow_LSX;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+#if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LASX;
+ ARGBToUVJRow = ARGBToUVJRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVJRow = ARGBToUVJRow_MSA;
+ ARGBToYJRow = ARGBToYJRow_LASX;
+ ARGBToUVJRow = ARGBToUVJRow_LASX;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_MMI;
- }
+#if defined(HAS_ARGBTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYJRow = ARGBToYJRow_RVV;
}
#endif
for (y = 0; y < height; ++y) {
- ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width);
ARGBToYJRow(src_argb, dst_yj, width);
src_argb += src_stride_argb;
dst_yj += dst_stride_yj;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
+ dst_uj += dst_stride_uj;
+ dst_vj += dst_stride_vj;
}
return 0;
}
@@ -1966,7 +2517,7 @@ int ARGBToJ400(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
@@ -1979,12 +2530,9 @@ int ARGBToJ400(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYJRow = ARGBToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_MMI;
- }
+#if defined(HAS_ARGBTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYJRow = ARGBToYJRow_RVV;
}
#endif
@@ -1996,6 +2544,798 @@ int ARGBToJ400(const uint8_t* src_argb,
return 0;
}
+// Convert RGBA to J400 (JPEG full-range grayscale).
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) =
+ RGBAToYJRow_C;
+ if (!src_rgba || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ // Coalesce rows.
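+ // When the strides equal the row width in bytes the planes are contiguous,
+ // so the whole image can be processed as one long row.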
+ if (src_stride_rgba == width * 4 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgba = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGBATOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToYJRow = RGBAToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGBAToYJRow = RGBAToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGBAToYJRow = RGBAToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToYJRow = RGBAToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYJRow = RGBAToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGBAToYJRow = RGBAToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGBAToYJRow = RGBAToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGBAToYJRow = RGBAToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RGBAToYJRow = RGBAToYJRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGBAToYJRow(src_rgba, dst_yj, width);
+ src_rgba += src_stride_rgba;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+// Convert ABGR to J420 (JPEG full-range I420).
+LIBYUV_API
+int ABGRToJ420(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
+ int width,
+ int height) {
+ int y;
+ void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_uj, uint8_t* dst_vj, int width) =
+ ABGRToUVJRow_C;
+ void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) =
+ ABGRToYJRow_C;
+ if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToYJRow = ABGRToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVJRow = ABGRToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToYJRow = ABGRToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYJRow = ABGRToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVJRow = ABGRToUVJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYJRow = ABGRToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVJRow = ABGRToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYJRow = ABGRToYJRow_Any_MSA;
+ ABGRToUVJRow = ABGRToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_MSA;
+ ABGRToUVJRow = ABGRToUVJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYJRow = ABGRToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ABGRToYJRow = ABGRToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYJRow = ABGRToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ABGRToYJRow = ABGRToYJRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVJRow(src_abgr, src_stride_abgr, dst_uj, dst_vj, width);
+ ABGRToYJRow(src_abgr, dst_yj, width);
+ ABGRToYJRow(src_abgr + src_stride_abgr, dst_yj + dst_stride_yj, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_yj += dst_stride_yj * 2;
+ dst_uj += dst_stride_uj;
+ dst_vj += dst_stride_vj;
+ }
+ if (height & 1) {
+ ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width);
+ ABGRToYJRow(src_abgr, dst_yj, width);
+ }
+ return 0;
+}
+
+// Convert ABGR to J422 (JPEG full-range I422).
+LIBYUV_API
+int ABGRToJ422(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ uint8_t* dst_uj,
+ int dst_stride_uj,
+ uint8_t* dst_vj,
+ int dst_stride_vj,
+ int width,
+ int height) {
+ int y;
+ void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_uj, uint8_t* dst_vj, int width) =
+ ABGRToUVJRow_C;
+ void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) =
+ ABGRToYJRow_C;
+ if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+ // Coalesce rows.
+ if (src_stride_abgr == width * 4 && dst_stride_yj == width &&
+ dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_abgr = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0;
+ }
+#if defined(HAS_ABGRTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToYJRow = ABGRToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVJRow = ABGRToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToYJRow = ABGRToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYJRow = ABGRToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVJRow = ABGRToUVJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYJRow = ABGRToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVJRow = ABGRToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVJRow = ABGRToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYJRow = ABGRToYJRow_Any_MSA;
+ ABGRToUVJRow = ABGRToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVJRow = ABGRToUVJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYJRow = ABGRToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ABGRToYJRow = ABGRToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYJRow = ABGRToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ABGRToYJRow = ABGRToYJRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width);
+ ABGRToYJRow(src_abgr, dst_yj, width);
+ src_abgr += src_stride_abgr;
+ dst_yj += dst_stride_yj;
+ dst_uj += dst_stride_uj;
+ dst_vj += dst_stride_vj;
+ }
+ return 0;
+}
+
+// Convert ABGR to J400 (JPEG full-range grayscale).
+LIBYUV_API
+int ABGRToJ400(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) =
+ ABGRToYJRow_C;
+ if (!src_abgr || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+ // Coalesce rows.
+ if (src_stride_abgr == width * 4 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_abgr = dst_stride_yj = 0;
+ }
+#if defined(HAS_ABGRTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToYJRow = ABGRToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToYJRow = ABGRToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYJRow = ABGRToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYJRow = ABGRToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYJRow = ABGRToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYJRow = ABGRToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYJRow = ABGRToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ABGRToYJRow = ABGRToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYJRow = ABGRToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ABGRToYJRow = ABGRToYJRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ABGRToYJRow(src_abgr, dst_yj, width);
+ src_abgr += src_stride_abgr;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+// Convert ARGB to AR64 (64-bit ARGB, 16 bits per channel).
+LIBYUV_API
+int ARGBToAR64(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64,
+ int width) = ARGBToAR64Row_C;
+ if (!src_argb || !dst_ar64 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_ar64 = 0;
+ }
+#if defined(HAS_ARGBTOAR64ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToAR64Row = ARGBToAR64Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAR64ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToAR64Row = ARGBToAR64Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAR64Row = ARGBToAR64Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAR64ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToAR64Row = ARGBToAR64Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAR64Row = ARGBToAR64Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAR64ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToAR64Row = ARGBToAR64Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToAR64Row(src_argb, dst_ar64, width);
+ src_argb += src_stride_argb;
+ dst_ar64 += dst_stride_ar64;
+ }
+ return 0;
+}
+
+// Convert ARGB to AB64 (64-bit ABGR, 16 bits per channel).
+LIBYUV_API
+int ARGBToAB64(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ab64,
+ int width) = ARGBToAB64Row_C;
+ if (!src_argb || !dst_ab64 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_ab64 = 0;
+ }
+#if defined(HAS_ARGBTOAB64ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToAB64Row = ARGBToAB64Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAB64ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToAB64Row = ARGBToAB64Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAB64Row = ARGBToAB64Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAB64ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToAB64Row = ARGBToAB64Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAB64Row = ARGBToAB64Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAB64ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToAB64Row = ARGBToAB64Row_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToAB64Row(src_argb, dst_ab64, width);
+ src_argb += src_stride_argb;
+ dst_ab64 += dst_stride_ab64;
+ }
+ return 0;
+}
+
+// Enabled if a one-pass (direct RAW to YJ) row kernel is available.
+#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \
+ defined(HAS_RAWTOYJROW_RVV)
+#define HAS_RAWTOYJROW
+#endif
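+// When defined, RAW rows convert straight to YJ/UVJ in one pass; otherwise
+// each row pair is expanded to ARGB first and then converted.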
+
+// Convert RAW to JNV21 (JPEG full-range NV21).
+LIBYUV_API
+int RAWToJNV21(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+#if defined(HAS_RAWTOYJROW)
+ void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw,
+ uint8_t* dst_uj, uint8_t* dst_vj, int width) =
+ RAWToUVJRow_C;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
+ RAWToYJRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_uj, uint8_t* dst_vj, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ void (*MergeUVRow_)(const uint8_t* src_uj, const uint8_t* src_vj,
+ uint8_t* dst_vu, int width) = MergeUVRow_C;
+ if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+#if defined(HAS_RAWTOYJROW)
+
+// The NEON version converts RAW directly to YUV.
+#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVJRow = RAWToUVJRow_Any_NEON;
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ RAWToUVJRow = RAWToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToUVJRow = RAWToUVJRow_Any_MSA;
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ RAWToUVJRow = RAWToUVJRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToYJRow = RAWToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToYJRow = RAWToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYJRow = RAWToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToYJRow = RAWToYJRow_RVV;
+ }
+#endif
+
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else // HAS_RAWTOYJROW
+
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RAWTOYJROW
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(halfwidth, 64)) {
+ MergeUVRow_ = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
+#endif
+ {
+#if defined(HAS_RAWTOYJROW)
+ // Allocate one row each of U and V, rounded up to 32-byte multiples.
+ const int row_uv_size = ((halfwidth + 31) & ~31);
+ align_buffer_64(row_uj, row_uv_size * 2);
+ uint8_t* row_vj = row_uj + row_uv_size;
+#else
+ // Allocate one row each of U and V, plus two rows of ARGB scratch.
+ const int row_size = ((width * 4 + 31) & ~31);
+ const int row_uv_size = ((halfwidth + 31) & ~31);
+ align_buffer_64(row_uj, row_uv_size * 2 + row_size * 2);
+ uint8_t* row_vj = row_uj + row_uv_size;
+ uint8_t* row = row_vj + row_uv_size;
+#endif
+ if (!row_uj)
+ return 1;
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYJROW)
+ RAWToUVJRow(src_raw, src_stride_raw, row_uj, row_vj, width);
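+ // V is passed first so the interleaved output is in VU order (NV21).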
+ MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth);
+ RAWToYJRow(src_raw, dst_y, width);
+ RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width);
+ ARGBToUVJRow(row, row_size, row_uj, row_vj, width);
+ MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+#if defined(HAS_RAWTOYJROW)
+ RAWToUVJRow(src_raw, 0, row_uj, row_vj, width);
+ MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth);
+ RAWToYJRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVJRow(row, 0, row_uj, row_vj, width);
+ MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+ free_aligned_buffer_64(row_uj);
+ }
+ return 0;
+}
+#undef HAS_RAWTOYJROW
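+
+// Usage sketch (buffer names illustrative, width assumed even): convert a
+// packed 24-bit RAW frame to full-range NV21 in one call:
+//   RAWToJNV21(raw, width * 3, y, width, vu, width, width, height);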
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/convert_jpeg.cc b/source/convert_jpeg.cc
index f440c7c2..d7556ee9 100644
--- a/files/source/convert_jpeg.cc
+++ b/source/convert_jpeg.cc
@@ -328,6 +328,140 @@ int MJPGToNV21(const uint8_t* src_mjpg,
return ret ? 0 : 1;
}
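+
+// DecodeToCallback hands each helper below a band of freshly decoded rows;
+// every call converts one band and advances the destination pointers.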
+static void JpegI420ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Call the NV21 writer with the U and V planes swapped to produce NV12.
+ I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Call the NV21 writer with the U and V planes swapped to produce NV12.
+ I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Call the NV21 writer with the U and V planes swapped to produce NV12.
+ I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 since there is no UV plane.
+ I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
+ dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPEG) to NV12.
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+ mjpeg_decoder.GetHeight() != src_height)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ // Use NV21Buffers but with UV instead of VU.
+ NV21Buffers bufs = {dst_y, dst_stride_y, dst_uv,
+ dst_stride_uv, dst_width, dst_height};
+ // YUV420: luma sampled 2x2 per chroma sample.
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV422: luma sampled 2x1 (horizontally) per chroma sample.
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV444: no chroma subsampling.
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV400: grayscale, single component.
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width,
+ dst_height);
+ } else {
+ // Unknown colorspace.
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
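+
+// Usage sketch (buffer names illustrative): decode one MJPEG frame straight
+// into an NV12 surface of matching size:
+//   MJPGToNV12(jpg, jpg_size, y, y_stride, uv, uv_stride,
+//              width, height, width, height);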
+
struct ARGBBuffers {
uint8_t* argb;
int argb_stride;
diff --git a/files/source/convert_to_argb.cc b/source/convert_to_argb.cc
index bde1aa88..84df16c8 100644
--- a/files/source/convert_to_argb.cc
+++ b/source/convert_to_argb.cc
@@ -32,9 +32,6 @@ extern "C" {
// TODO(fbarchard): Add the following:
// H010ToARGB
// I010ToARGB
-// J400ToARGB
-// J422ToARGB
-// J444ToARGB
LIBYUV_API
int ConvertToARGB(const uint8_t* sample,
@@ -161,6 +158,11 @@ int ConvertToARGB(const uint8_t* sample,
r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
inv_crop_height);
break;
+ case FOURCC_J400:
+ src = sample + src_width * crop_y + crop_x;
+ r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+ inv_crop_height);
+ break;
// Biplanar formats
case FOURCC_NV12:
@@ -178,12 +180,6 @@ int ConvertToARGB(const uint8_t* sample,
r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
dst_stride_argb, crop_width, inv_crop_height);
break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
- inv_crop_height);
- break;
-
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
@@ -208,6 +204,19 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_J420: {
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
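+ // The U plane follows the full-size Y plane; the V plane follows the
+ // half-size U plane.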
+ const uint8_t* src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_H420: {
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
@@ -221,7 +230,7 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
- case FOURCC_J420: {
+ case FOURCC_U420: {
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
@@ -229,7 +238,7 @@ int ConvertToARGB(const uint8_t* sample,
(halfwidth * crop_y + crop_x) / 2;
const uint8_t* src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
@@ -256,6 +265,18 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_J422: {
+ int halfwidth = (src_width + 1) / 2;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u =
+ sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_H422: {
int halfwidth = (src_width + 1) / 2;
const uint8_t* src_y = sample + src_width * crop_y + crop_x;
@@ -268,6 +289,18 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_U422: {
+ int halfwidth = (src_width + 1) / 2;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u =
+ sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ r = U422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_I444:
case FOURCC_YV24: {
const uint8_t* src_y = sample + src_width * crop_y + crop_x;
@@ -284,6 +317,40 @@ int ConvertToARGB(const uint8_t* sample,
dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
+
+ case FOURCC_J444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
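+ // 4:4:4: the U and V planes are full width and full height.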
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_H444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_U444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
#ifdef HAVE_JPEG
case FOURCC_MJPG:
r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width,
diff --git a/files/source/convert_to_i420.cc b/source/convert_to_i420.cc
index 584be0ac..5869ecd7 100644
--- a/files/source/convert_to_i420.cc
+++ b/source/convert_to_i420.cc
@@ -89,18 +89,26 @@ int ConvertToI420(const uint8_t* sample,
switch (format) {
// Single plane formats
- case FOURCC_YUY2:
+ case FOURCC_YUY2: { // TODO(fbarchard): Find better odd crop fix.
+ uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+ uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+ int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+ int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
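+ // An odd crop_x lands on the V byte of the YUY2 pixel pair, so the U and
+ // V destinations are swapped to keep the chroma phase correct.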
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
- dst_stride_u, dst_v, dst_stride_v, crop_width,
- inv_crop_height);
+ r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+ stride_u, v, stride_v, crop_width, inv_crop_height);
break;
- case FOURCC_UYVY:
+ }
+ case FOURCC_UYVY: {
+ uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+ uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+ int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+ int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
- dst_stride_u, dst_v, dst_stride_v, crop_width,
- inv_crop_height);
+ r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+ stride_u, v, stride_v, crop_width, inv_crop_height);
break;
+ }
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
@@ -179,11 +187,6 @@ int ConvertToI420(const uint8_t* sample,
dst_stride_y, dst_v, dst_stride_v, dst_u,
dst_stride_u, crop_width, inv_crop_height, rotation);
break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, crop_width, inv_crop_height);
- break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
diff --git a/files/source/cpu_id.cc b/source/cpu_id.cc
index 48e2b615..eedce16b 100644
--- a/files/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -20,7 +20,7 @@
#endif
// For ArmCpuCaps() but unittested on all platforms
-#include <stdio.h>
+#include <stdio.h> // For fopen()
#include <string.h>
#ifdef __cplusplus
@@ -40,7 +40,6 @@ extern "C" {
// cpu_info_ variable for SIMD instruction sets detected.
LIBYUV_API int cpu_info_ = 0;
-// TODO(fbarchard): Consider using int for cpuid so casting is not needed.
// Low level cpuid for X86.
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
defined(__x86_64__)) && \
@@ -75,9 +74,9 @@ void CpuId(int info_eax, int info_ecx, int* cpu_info) {
asm volatile(
#if defined(__i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
- "mov %%ebx, %%edi \n"
+ "mov %%ebx, %%edi \n"
"cpuid \n"
- "xchg %%edi, %%ebx \n"
+ "xchg %%edi, %%ebx \n"
: "=D"(info_ebx),
#else
"cpuid \n"
@@ -108,14 +107,14 @@ void CpuId(int eax, int ecx, int* cpu_info) {
// }
// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
// https://code.google.com/p/libyuv/issues/detail?id=529
-#if defined(_M_IX86) && (_MSC_VER < 1900)
+#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
#pragma optimize("g", off)
#endif
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
-int GetXCR0() {
+static int GetXCR0() {
int xcr0 = 0;
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT
@@ -129,21 +128,22 @@ int GetXCR0() {
#define GetXCR0() 0
#endif // defined(_M_IX86) || defined(_M_X64) ..
// Return optimization to previous setting.
-#if defined(_M_IX86) && (_MSC_VER < 1900)
+#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
#pragma optimize("g", on)
#endif
-// based on libvpx arm_cpudetect.c
+// Based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
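+ // The "e" fopen mode flag requests close-on-exec where supported (a glibc
+ // extension).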
- FILE* f = fopen(cpuinfo_name, "r");
+ FILE* f = fopen(cpuinfo_name, "re");
if (!f) {
// Assume Neon if /proc/cpuinfo is unavailable.
// This will occur for Chrome sandbox for Pepper or Render process.
return kCpuHasNEON;
}
- while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+ memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
if (memcmp(cpuinfo_line, "Features", 8) == 0) {
char* p = strstr(cpuinfo_line, " neon");
if (p && (p[5] == ' ' || p[5] == '\n')) {
@@ -162,47 +162,128 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
return 0;
}
-// TODO(fbarchard): Consider read_msa_ir().
-// TODO(fbarchard): Add unittest.
-LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
- const char ase[]) {
+LIBYUV_API SAFEBUFFERS int RiscvCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
- FILE* f = fopen(cpuinfo_name, "r");
+ int flag = 0;
+ FILE* f = fopen(cpuinfo_name, "re");
if (!f) {
- // ase enabled if /proc/cpuinfo is unavailable.
- if (strcmp(ase, " msa") == 0) {
- return kCpuHasMSA;
+#if defined(__riscv_vector)
+ // Assume RVV if /proc/cpuinfo is unavailable.
+ // This will occur for Chrome sandbox for Pepper or Render process.
+ return kCpuHasRVV;
+#else
+ return 0;
+#endif
+ }
+ memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
+ if (memcmp(cpuinfo_line, "isa", 3) == 0) {
+ // ISA string must begin with rv64{i,e,g} for a 64-bit processor.
+ char* isa = strstr(cpuinfo_line, "rv64");
+ if (isa) {
+ size_t isa_len = strlen(isa);
+ char* extensions;
+ size_t extensions_len = 0;
+ size_t std_isa_len;
+        // Remove the newline character at the end of the string.
+ if (isa[isa_len - 1] == '\n') {
+ isa[--isa_len] = '\0';
+ }
+        // Expect at least 5 characters: "rv64" plus the base ISA letter.
+ if (isa_len < 5) {
+ fclose(f);
+ return 0;
+ }
+        // Skip canonical checking of the {i,e,g} base ISA letter and
+        // advance past "rv64" plus that letter (5 characters).
+ isa += 5;
+        // Find the first occurrence of 'z', 'x' or 's', which begins the
+        // multi-letter standard, non-standard, or supervisor-level
+        // extensions.
+ extensions = strpbrk(isa, "zxs");
+ if (extensions) {
+          // Multi-letter extensions are separated by a single underscore
+ // as described in RISC-V User-Level ISA V2.2.
+          // Measure extensions before strtok() replaces '_' with NULs.
+          extensions_len = strlen(extensions);
+          char* ext = strtok(extensions, "_");
+ while (ext) {
+ // Search for the ZVFH (Vector FP16) extension.
+ if (!strcmp(ext, "zvfh")) {
+ flag |= kCpuHasRVVZVFH;
+ }
+ ext = strtok(NULL, "_");
+ }
+ }
+ std_isa_len = isa_len - extensions_len - 5;
+ // Detect the v in the standard single-letter extensions.
+ if (memchr(isa, 'v', std_isa_len)) {
+          // The V (vector) extension implies the F extension.
+ flag |= kCpuHasRVV;
+ }
+ }
}
- if (strcmp(ase, " mmi") == 0) {
- return kCpuHasMMI;
+#if defined(__riscv_vector)
+    // Assume RVV if /proc/cpuinfo is from an x86 host running QEMU.
+ else if ((memcmp(cpuinfo_line, "vendor_id\t: GenuineIntel", 24) == 0) ||
+ (memcmp(cpuinfo_line, "vendor_id\t: AuthenticAMD", 24) == 0)) {
+ fclose(f);
+ return kCpuHasRVV;
}
+#endif
+ }
+ fclose(f);
+ return flag;
+}
+
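The ISA-string parsing above is dense in diff form. Here is a minimal standalone sketch of the same flow, run against a sample string; the "rv64imafdcv_zicsr_zvfh" value and the pointer-difference length arithmetic are illustrative, not code from this patch.

// Sketch: parse a hypothetical RISC-V ISA string the way RiscvCpuCaps does.
#include <stdio.h>
#include <string.h>

int main(void) {
  char isa[] = "rv64imafdcv_zicsr_zvfh";  // Hypothetical /proc/cpuinfo value.
  char* ext = strpbrk(isa + 5, "zxs");    // Start of multi-letter extensions.
  // Single-letter extensions run from after "rv64" + base letter to there.
  size_t std_len = (ext ? (size_t)(ext - isa) : strlen(isa)) - 5;
  int has_v = memchr(isa + 5, 'v', std_len) != NULL;  // RVV present.
  int has_zvfh = 0;
  // Multi-letter extensions are underscore-separated; strtok splits in place.
  for (char* e = ext ? strtok(ext, "_") : NULL; e; e = strtok(NULL, "_")) {
    if (!strcmp(e, "zvfh")) has_zvfh = 1;
  }
  printf("RVV=%d ZVFH=%d\n", has_v, has_zvfh);  // Prints "RVV=1 ZVFH=1".
  return 0;
}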
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
+ char cpuinfo_line[512];
+ int flag = 0;
+ FILE* f = fopen(cpuinfo_name, "re");
+ if (!f) {
+ // Assume nothing if /proc/cpuinfo is unavailable.
+ // This will occur for Chrome sandbox for Pepper or Render process.
return 0;
}
- while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
- if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
- char* p = strstr(cpuinfo_line, ase);
- if (p) {
- fclose(f);
- if (strcmp(ase, " msa") == 0) {
- return kCpuHasMSA;
- }
- return 0;
+ memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
+ if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
+      // Work around early kernels that omit MSA from the ASEs line.
+ if (strstr(cpuinfo_line, "Loongson-2K")) {
+ flag |= kCpuHasMSA;
}
- } else if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
- char* p = strstr(cpuinfo_line, "Loongson-3");
- if (p) {
- fclose(f);
- if (strcmp(ase, " mmi") == 0) {
- return kCpuHasMMI;
- }
- return 0;
+ }
+ if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+ if (strstr(cpuinfo_line, "msa")) {
+ flag |= kCpuHasMSA;
}
+ // ASEs is the last line, so we can break here.
+ break;
}
}
fclose(f);
- return 0;
+ return flag;
}
+#define LOONGARCH_CFG2 0x2
+#define LOONGARCH_CFG2_LSX (1 << 6)
+#define LOONGARCH_CFG2_LASX (1 << 7)
+
+#if defined(__loongarch__)
+LIBYUV_API SAFEBUFFERS int LoongarchCpuCaps(void) {
+ int flag = 0;
+ uint32_t cfg2 = 0;
+
+ __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(cfg2) : "r"(LOONGARCH_CFG2));
+
+ if (cfg2 & LOONGARCH_CFG2_LSX)
+ flag |= kCpuHasLSX;
+
+ if (cfg2 & LOONGARCH_CFG2_LASX)
+ flag |= kCpuHasLASX;
+ return flag;
+}
+#endif
+
static SAFEBUFFERS int GetCpuFlags(void) {
int cpu_info = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
@@ -211,10 +292,12 @@ static SAFEBUFFERS int GetCpuFlags(void) {
int cpu_info0[4] = {0, 0, 0, 0};
int cpu_info1[4] = {0, 0, 0, 0};
int cpu_info7[4] = {0, 0, 0, 0};
+ int cpu_einfo7[4] = {0, 0, 0, 0};
CpuId(0, 0, cpu_info0);
CpuId(1, 0, cpu_info1);
if (cpu_info0[0] >= 7) {
CpuId(7, 0, cpu_info7);
+ CpuId(7, 1, cpu_einfo7);
}
cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
@@ -227,7 +310,9 @@ static SAFEBUFFERS int GetCpuFlags(void) {
((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
- ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0);
+ ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0) |
+ ((cpu_einfo7[0] & 0x00000010) ? kCpuHasAVXVNNI : 0) |
+ ((cpu_einfo7[3] & 0x00000010) ? kCpuHasAVXVNNIINT8 : 0);
// Detect AVX512bw
if ((GetXCR0() & 0xe0) == 0xe0) {
@@ -235,20 +320,20 @@ static SAFEBUFFERS int GetCpuFlags(void) {
cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0;
cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0;
cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0;
+ cpu_info |= (cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0;
cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0;
- cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0;
- cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0;
+ cpu_info |= (cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0;
}
}
#endif
#if defined(__mips__) && defined(__linux__)
-#if defined(__mips_msa)
- cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
-#elif defined(_MIPS_ARCH_LOONGSON3A)
- cpu_info = MipsCpuCaps("/proc/cpuinfo", " mmi");
-#endif
+ cpu_info = MipsCpuCaps("/proc/cpuinfo");
cpu_info |= kCpuHasMIPS;
#endif
+#if defined(__loongarch__) && defined(__linux__)
+ cpu_info = LoongarchCpuCaps();
+ cpu_info |= kCpuHasLOONGARCH;
+#endif
#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
@@ -267,6 +352,10 @@ static SAFEBUFFERS int GetCpuFlags(void) {
#endif
cpu_info |= kCpuHasARM;
#endif // __arm__
+#if defined(__riscv) && defined(__linux__)
+ cpu_info = RiscvCpuCaps("/proc/cpuinfo");
+ cpu_info |= kCpuHasRISCV;
+#endif // __riscv
cpu_info |= kCpuInitialized;
return cpu_info;
}
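Taken together, the cpu_id.cc changes add runtime detection for RISC-V Vector, LoongArch SIMD, and the newer AVX VNNI leaves. A minimal sketch of how a caller consumes them through the public TestCpuFlag() API; the dispatch itself is illustrative:

// Sketch: runtime dispatch on the flags added by this patch.
#include "libyuv/cpu_id.h"

const char* PickSimdPath(void) {
  if (libyuv::TestCpuFlag(libyuv::kCpuHasRVV)) return "RVV";
  if (libyuv::TestCpuFlag(libyuv::kCpuHasLASX)) return "LASX";
  if (libyuv::TestCpuFlag(libyuv::kCpuHasLSX)) return "LSX";
  if (libyuv::TestCpuFlag(libyuv::kCpuHasAVXVNNI)) return "AVX-VNNI";
  return "C";  // Portable fallback.
}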
diff --git a/files/source/mjpeg_decoder.cc b/source/mjpeg_decoder.cc
index 5c5e5ead..0141da8a 100644
--- a/files/source/mjpeg_decoder.cc
+++ b/source/mjpeg_decoder.cc
@@ -109,7 +109,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) {
}
buf_.data = src;
- buf_.len = static_cast<int>(src_len);
+ buf_.len = (int)src_len;
buf_vec_.pos = 0;
decompress_struct_->client_data = &buf_vec_;
#ifdef HAVE_SETJMP
@@ -417,7 +417,6 @@ void init_source(j_decompress_ptr cinfo) {
boolean fill_input_buffer(j_decompress_ptr cinfo) {
BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
if (buf_vec->pos >= buf_vec->len) {
- assert(0 && "No more data");
// ERROR: No more data
return FALSE;
}
@@ -429,8 +428,8 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
jpeg_source_mgr* src = cinfo->src;
- size_t bytes = static_cast<size_t>(num_bytes);
- if(bytes > src->bytes_in_buffer) {
+ size_t bytes = (size_t)num_bytes;
+ if (bytes > src->bytes_in_buffer) {
src->next_input_byte = nullptr;
src->bytes_in_buffer = 0;
} else {
diff --git a/files/source/mjpeg_validate.cc b/source/mjpeg_validate.cc
index ba0a03ab..ba0a03ab 100644
--- a/files/source/mjpeg_validate.cc
+++ b/source/mjpeg_validate.cc
diff --git a/files/source/planar_functions.cc b/source/planar_functions.cc
index 9cab230f..1c94e260 100644
--- a/files/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -10,6 +10,7 @@
#include "libyuv/planar_functions.h"
+#include <assert.h>
#include <string.h> // for memset()
#include "libyuv/cpu_id.h"
@@ -34,6 +35,9 @@ void CopyPlane(const uint8_t* src_y,
int height) {
int y;
void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -71,6 +75,11 @@ void CopyPlane(const uint8_t* src_y,
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
+#if defined(HAS_COPYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ CopyRow = CopyRow_RVV;
+ }
+#endif
// Copy plane
for (y = 0; y < height; ++y) {
@@ -80,8 +89,6 @@ void CopyPlane(const uint8_t* src_y,
}
}
-// TODO(fbarchard): Consider support for negative height.
-// TODO(fbarchard): Consider stride measured in bytes.
LIBYUV_API
void CopyPlane_16(const uint16_t* src_y,
int src_stride_y,
@@ -89,36 +96,8 @@ void CopyPlane_16(const uint16_t* src_y,
int dst_stride_y,
int width,
int height) {
- int y;
- void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C;
- // Coalesce rows.
- if (src_stride_y == width && dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_y = 0;
- }
-#if defined(HAS_COPYROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_16_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_16_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_16_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_16_NEON;
- }
-#endif
-
- // Copy plane
- for (y = 0; y < height; ++y) {
- CopyRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
- }
+ CopyPlane((const uint8_t*)src_y, src_stride_y * 2, (uint8_t*)dst_y,
+ dst_stride_y * 2, width * 2, height);
}
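The rewritten CopyPlane_16 reinterprets 16-bit rows as raw bytes: strides and width given in uint16_t elements are doubled into byte counts and the work is delegated to CopyPlane, which already has SSE2/ERMS/NEON/RVV row kernels. A one-row sketch of the equivalence (names are illustrative):

// Sketch: copying `width` uint16_t pixels equals copying width * 2 bytes.
#include <stdint.h>
#include <string.h>

void CopyRow16AsBytes(const uint16_t* src, uint16_t* dst, int width) {
  memcpy(dst, src, (size_t)width * 2);  // Same bytes, same result.
}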
// Convert a plane of 16 bit data to 8 bit
@@ -134,6 +113,9 @@ void Convert16To8Plane(const uint16_t* src_y,
void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
int width) = Convert16To8Row_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -146,6 +128,14 @@ void Convert16To8Plane(const uint16_t* src_y,
height = 1;
src_stride_y = dst_stride_y = 0;
}
+#if defined(HAS_CONVERT16TO8ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Convert16To8Row = Convert16To8Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ Convert16To8Row = Convert16To8Row_NEON;
+ }
+ }
+#endif
#if defined(HAS_CONVERT16TO8ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
Convert16To8Row = Convert16To8Row_Any_SSSE3;
@@ -177,13 +167,16 @@ void Convert8To16Plane(const uint8_t* src_y,
int src_stride_y,
uint16_t* dst_y,
int dst_stride_y,
- int scale, // 16384 for 10 bits
+ int scale, // 1024 for 10 bits
int width,
int height) {
int y;
void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale,
int width) = Convert8To16Row_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -238,9 +231,12 @@ int I422Copy(const uint8_t* src_y,
int width,
int height) {
int halfwidth = (width + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
return -1;
}
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -276,7 +272,8 @@ int I444Copy(const uint8_t* src_y,
int dst_stride_v,
int width,
int height) {
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -298,6 +295,88 @@ int I444Copy(const uint8_t* src_y,
return 0;
}
+// Copy I210.
+LIBYUV_API
+int I210Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ // Copy UV planes.
+ CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+ CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+ return 0;
+}
+
+// Copy I410.
+LIBYUV_API
+int I410Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+}
+
// Copy I400.
LIBYUV_API
int I400ToI400(const uint8_t* src_y,
@@ -349,6 +428,56 @@ int I420ToI400(const uint8_t* src_y,
return 0;
}
+// Copy NV12. Supports inverting.
+LIBYUV_API
+int NV12Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+
+ if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2,
+ halfheight);
+ return 0;
+}
+
+// Copy NV21. Supports inverting.
+LIBYUV_API
+int NV21Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+ dst_stride_y, dst_vu, dst_stride_vu, width, height);
+}
+
// Support function for NV12 etc UV channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API
@@ -363,6 +492,9 @@ void SplitUVPlane(const uint8_t* src_uv,
int y;
void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
int width) = SplitUVRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -410,14 +542,19 @@ void SplitUVPlane(const uint8_t* src_uv,
}
}
#endif
-#if defined(HAS_SPLITUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SplitUVRow = SplitUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SplitUVRow = SplitUVRow_MMI;
+#if defined(HAS_SPLITUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SplitUVRow = SplitUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_LSX;
}
}
#endif
+#if defined(HAS_SPLITUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ SplitUVRow = SplitUVRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
// Copy a row of UV.
@@ -440,6 +577,9 @@ void MergeUVPlane(const uint8_t* src_u,
int y;
void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
uint8_t* dst_uv, int width) = MergeUVRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -464,11 +604,19 @@ void MergeUVPlane(const uint8_t* src_u,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
+ if (IS_ALIGNED(width, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW)) {
+ MergeUVRow = MergeUVRow_Any_AVX512BW;
+ if (IS_ALIGNED(width, 32)) {
+ MergeUVRow = MergeUVRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow = MergeUVRow_Any_NEON;
@@ -485,14 +633,19 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- MergeUVRow = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ MergeUVRow = MergeUVRow_LSX;
}
}
#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow = MergeUVRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of UV.
@@ -503,6 +656,289 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
+// Support function for P010 etc UV channels.
+// Width and height are plane sizes (typically half pixel width).
+LIBYUV_API
+void SplitUVPlane_16(const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u,
+ uint16_t* dst_v, int depth, int width) =
+ SplitUVRow_16_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == width * 2 && dst_stride_u == width &&
+ dst_stride_v == width) {
+ width *= height;
+ height = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_SPLITUVROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow_16 = SplitUVRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow_16 = SplitUVRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow_16 = SplitUVRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow_16 = SplitUVRow_16_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ // Copy a row of UV.
+ SplitUVRow_16(src_uv, dst_u, dst_v, depth, width);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += src_stride_uv;
+ }
+}
+
+LIBYUV_API
+void MergeUVPlane_16(const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v,
+ uint16_t* dst_uv, int depth, int width) =
+ MergeUVRow_16_C;
+ assert(depth >= 8);
+ assert(depth <= 16);
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uv = dst_uv + (height - 1) * dst_stride_uv;
+ dst_stride_uv = -dst_stride_uv;
+ }
+ // Coalesce rows.
+ if (src_stride_u == width && src_stride_v == width &&
+ dst_stride_uv == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_u = src_stride_v = dst_stride_uv = 0;
+ }
+#if defined(HAS_MERGEUVROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_16 = MergeUVRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ MergeUVRow_16 = MergeUVRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_16 = MergeUVRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeUVRow_16 = MergeUVRow_16_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ // Merge a row of U and V into a row of UV.
+ MergeUVRow_16(src_u, src_v, dst_uv, depth, width);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uv += dst_stride_uv;
+ }
+}
+
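A hypothetical round trip through the two 16-bit helpers above for a P010-style frame; the buffers, tightly packed strides, and depth value are example choices, not requirements of the patch:

// Sketch: split an interleaved 10-bit UV plane, then re-merge it.
#include <stdint.h>
#include "libyuv/planar_functions.h"

void RoundTripUV10(const uint16_t* src_uv, int src_stride_uv,
                   uint16_t* tmp_u, uint16_t* tmp_v, uint16_t* dst_uv,
                   int halfwidth, int halfheight) {
  libyuv::SplitUVPlane_16(src_uv, src_stride_uv, tmp_u, halfwidth, tmp_v,
                          halfwidth, halfwidth, halfheight, /*depth=*/10);
  libyuv::MergeUVPlane_16(tmp_u, halfwidth, tmp_v, halfwidth, dst_uv,
                          src_stride_uv, halfwidth, halfheight, /*depth=*/10);
}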
+// Convert a plane from LSB- to MSB-aligned samples.
+LIBYUV_API
+void ConvertToMSBPlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ int scale = 1 << (16 - depth);
+ void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale,
+ int width) = MultiplyRow_16_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+
+#if defined(HAS_MULTIPLYROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MultiplyRow_16 = MultiplyRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MultiplyRow_16 = MultiplyRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MULTIPLYROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MultiplyRow_16 = MultiplyRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MultiplyRow_16 = MultiplyRow_16_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MultiplyRow_16(src_y, dst_y, scale, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Convert a plane from MSB- to LSB-aligned samples.
+LIBYUV_API
+void ConvertToLSBPlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ int scale = 1 << depth;
+ void (*DivideRow)(const uint16_t* src_y, uint16_t* dst_y, int scale,
+ int width) = DivideRow_16_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+
+#if defined(HAS_DIVIDEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ DivideRow = DivideRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ DivideRow = DivideRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_DIVIDEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ DivideRow = DivideRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DivideRow = DivideRow_16_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ DivideRow(src_y, dst_y, scale, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
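A worked example of the two scale factors for depth == 10, assuming the row kernels compute dst = src * scale (multiply) and dst = (src * scale) >> 16 (divide):

// ConvertToMSBPlane_16: scale = 1 << (16 - 10) = 64
//   0x03FF (10-bit max, LSB-aligned) * 64 = 0xFFC0 (MSB-aligned)
// ConvertToLSBPlane_16: scale = 1 << 10 = 1024
//   (0xFFC0 * 1024) >> 16 = 0xFFC0 >> 6 = 0x03FF (LSB-aligned again)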
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+ SwapUVRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_uv = dst_stride_vu = 0;
+ }
+
+#if defined(HAS_SWAPUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SwapUVRow = SwapUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ SwapUVRow = SwapUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SWAPUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SwapUVRow = SwapUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SwapUVRow = SwapUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SWAPUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SwapUVRow = SwapUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SwapUVRow = SwapUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ SwapUVRow(src_uv, dst_vu, width);
+ src_uv += src_stride_uv;
+ dst_vu += dst_stride_vu;
+ }
+}
+
// Convert NV21 to NV12.
LIBYUV_API
int NV21ToNV12(const uint8_t* src_y,
@@ -515,51 +951,286 @@ int NV21ToNV12(const uint8_t* src_y,
int dst_stride_uv,
int width,
int height) {
- int y;
- void (*UVToVURow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
- UVToVURow_C;
-
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
+
if (!src_vu || !dst_uv || width <= 0 || height == 0) {
return -1;
}
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
src_vu = src_vu + (halfheight - 1) * src_stride_vu;
- src_stride_y = -src_stride_y;
src_stride_vu = -src_stride_vu;
}
- // Coalesce rows.
- if (src_stride_vu == halfwidth * 2 && dst_stride_uv == halfwidth * 2) {
- halfwidth *= halfheight;
- halfheight = 1;
- src_stride_vu = dst_stride_uv = 0;
+
+ SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth,
+ halfheight);
+ return 0;
+}
+
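NV21ToNV12 now decomposes into an optional Y-plane copy plus a UV byte swap. A hypothetical call for a 640x480 frame with tightly packed planes (all sizes illustrative):

// Sketch: NV21 -> NV12; internally CopyPlane(Y) + SwapUVPlane(320x240 pairs).
#include <stdint.h>
#include "libyuv/planar_functions.h"

void Nv21ToNv12Example(const uint8_t* src_y, const uint8_t* src_vu,
                       uint8_t* dst_y, uint8_t* dst_uv) {
  libyuv::NV21ToNV12(src_y, 640, src_vu, 640, dst_y, 640, dst_uv, 640, 640,
                     480);
}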
+// Test if tile_height is a power of 2 (16 or 32)
+#define IS_POWEROFTWO(x) (!((x) & ((x)-1)))
+
+// Detile a plane of data
+// Tile width is assumed to be 16.
+// tile_height is 16 or 32 for MM21.
+// src_stride_y is bytes per row of the source, ignoring tiling, e.g. 640.
+// TODO: More detile row functions.
+LIBYUV_API
+int DetilePlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int tile_height) {
+ const ptrdiff_t src_tile_stride = 16 * tile_height;
+ int y;
+ void (*DetileRow)(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst,
+ int width) = DetileRow_C;
+ if (!src_y || !dst_y || width <= 0 || height == 0 ||
+ !IS_POWEROFTWO(tile_height)) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
}
-#if defined(HAS_UVToVUROW_NEON)
+#if defined(HAS_DETILEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ DetileRow = DetileRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow = DetileRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_DETILEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- UVToVURow = UVToVURow_Any_NEON;
- if (IS_ALIGNED(halfwidth, 16)) {
- UVToVURow = UVToVURow_NEON;
+ DetileRow = DetileRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow = DetileRow_NEON;
}
}
#endif
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+
+ // Detile plane
+ for (y = 0; y < height; ++y) {
+ DetileRow(src_y, src_tile_stride, dst_y, width);
+ dst_y += dst_stride_y;
+ src_y += 16;
+ // Advance to next row of tiles.
+ if ((y & (tile_height - 1)) == (tile_height - 1)) {
+ src_y = src_y - src_tile_stride + src_stride_y * tile_height;
+ }
+ }
+ return 0;
+}
+
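A worked example of the tile walk, assuming MM21-style 16x32 tiles and a hypothetical 640-byte-wide plane (src_stride_y = 640):

//   src_tile_stride = 16 * 32 = 512 bytes per tile.
//   Output row y starts 16 * (y % 32) bytes into its tile; DetileRow hops
//   across the tile row in steps of src_tile_stride, 16 bytes per hop.
//   After 32 rows the cursor has advanced one whole tile (16 * 32 bytes),
//   so it rewinds by src_tile_stride and jumps to the next band of tiles:
//   src_y += 640 * 32 - 512.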
+// Convert a plane of 16 bit tiles of 16 x H to linear.
+// Tile width is assumed to be 16.
+// tile_height is 16 or 32 for MT2T.
+LIBYUV_API
+int DetilePlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int tile_height) {
+ const ptrdiff_t src_tile_stride = 16 * tile_height;
+ int y;
+ void (*DetileRow_16)(const uint16_t* src, ptrdiff_t src_tile_stride,
+ uint16_t* dst, int width) = DetileRow_16_C;
+ if (!src_y || !dst_y || width <= 0 || height == 0 ||
+ !IS_POWEROFTWO(tile_height)) {
+ return -1;
}
- for (y = 0; y < halfheight; ++y) {
- UVToVURow(src_vu, dst_uv, halfwidth);
- src_vu += src_stride_vu;
- dst_uv += dst_stride_uv;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+
+#if defined(HAS_DETILEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ DetileRow_16 = DetileRow_16_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow_16 = DetileRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_DETILEROW_16_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ DetileRow_16 = DetileRow_16_Any_AVX;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow_16 = DetileRow_16_AVX;
+ }
+ }
+#endif
+#if defined(HAS_DETILEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ DetileRow_16 = DetileRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow_16 = DetileRow_16_NEON;
+ }
+ }
+#endif
+
+ // Detile plane
+ for (y = 0; y < height; ++y) {
+ DetileRow_16(src_y, src_tile_stride, dst_y, width);
+ dst_y += dst_stride_y;
+ src_y += 16;
+ // Advance to next row of tiles.
+ if ((y & (tile_height - 1)) == (tile_height - 1)) {
+ src_y = src_y - src_tile_stride + src_stride_y * tile_height;
+ }
}
return 0;
}
+LIBYUV_API
+void DetileSplitUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int tile_height) {
+ const ptrdiff_t src_tile_stride = 16 * tile_height;
+ int y;
+ void (*DetileSplitUVRow)(const uint8_t* src, ptrdiff_t src_tile_stride,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ DetileSplitUVRow_C;
+  assert(src_stride_uv > 0);
+  assert(tile_height > 0);
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_stride_u = -dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_v = -dst_stride_v;
+ }
+
+#if defined(HAS_DETILESPLITUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ DetileSplitUVRow = DetileSplitUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ DetileSplitUVRow = DetileSplitUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_DETILESPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ DetileSplitUVRow = DetileSplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DetileSplitUVRow = DetileSplitUVRow_NEON;
+ }
+ }
+#endif
+
+ // Detile plane
+ for (y = 0; y < height; ++y) {
+ DetileSplitUVRow(src_uv, src_tile_stride, dst_u, dst_v, width);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += 16;
+ // Advance to next row of tiles.
+ if ((y & (tile_height - 1)) == (tile_height - 1)) {
+ src_uv = src_uv - src_tile_stride + src_stride_uv * tile_height;
+ }
+ }
+}
+
+LIBYUV_API
+void DetileToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height,
+ int tile_height) {
+ const ptrdiff_t src_y_tile_stride = 16 * tile_height;
+ const ptrdiff_t src_uv_tile_stride = src_y_tile_stride / 2;
+ int y;
+ void (*DetileToYUY2)(const uint8_t* src_y, ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2, int width) = DetileToYUY2_C;
+  assert(src_stride_y > 0);
+  assert(src_stride_uv > 0);
+ assert(tile_height > 0);
+
+ if (width <= 0 || height == 0 || tile_height <= 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+
+#if defined(HAS_DETILETOYUY2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ DetileToYUY2 = DetileToYUY2_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DetileToYUY2 = DetileToYUY2_NEON;
+ }
+ }
+#endif
+
+#if defined(HAS_DETILETOYUY2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ DetileToYUY2 = DetileToYUY2_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ DetileToYUY2 = DetileToYUY2_SSE2;
+ }
+ }
+#endif
+
+ // Detile plane
+ for (y = 0; y < height; ++y) {
+ DetileToYUY2(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2,
+ width);
+ dst_yuy2 += dst_stride_yuy2;
+ src_y += 16;
+
+ if (y & 0x1)
+ src_uv += 16;
+
+ // Advance to next row of tiles.
+ if ((y & (tile_height - 1)) == (tile_height - 1)) {
+ src_y = src_y - src_y_tile_stride + src_stride_y * tile_height;
+ src_uv = src_uv - src_uv_tile_stride + src_stride_uv * (tile_height / 2);
+ }
+ }
+}
+
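One subtlety in DetileToYUY2 worth a reading note (descriptive only, not code from the patch):

// The source is 4:2:0, so one UV row serves two Y rows: the UV cursor takes
// its 16-byte step only on odd rows, and the UV band jump uses
// tile_height / 2 with the halved src_uv_tile_stride.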
// Support function for NV12 etc RGB channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API
@@ -576,6 +1247,9 @@ void SplitRGBPlane(const uint8_t* src_rgb,
int y;
void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
uint8_t* dst_b, int width) = SplitRGBRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -609,12 +1283,9 @@ void SplitRGBPlane(const uint8_t* src_rgb,
}
}
#endif
-#if defined(HAS_SPLITRGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SplitRGBRow = SplitRGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- SplitRGBRow = SplitRGBRow_MMI;
- }
+#if defined(HAS_SPLITRGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ SplitRGBRow = SplitRGBRow_RVV;
}
#endif
@@ -643,6 +1314,9 @@ void MergeRGBPlane(const uint8_t* src_r,
void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
const uint8_t* src_b, uint8_t* dst_rgb, int width) =
MergeRGBRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Coalesce rows.
// Negative height means invert the image.
if (height < 0) {
@@ -673,12 +1347,9 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
#endif
-#if defined(HAS_MERGERGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeRGBRow = MergeRGBRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- MergeRGBRow = MergeRGBRow_MMI;
- }
+#if defined(HAS_MERGERGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeRGBRow = MergeRGBRow_RVV;
}
#endif
@@ -692,67 +1363,694 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
-// Mirror a plane of data.
-void MirrorPlane(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- int width,
- int height) {
+LIBYUV_NOINLINE
+static void SplitARGBPlaneAlpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height) {
int y;
- void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+ uint8_t* dst_b, uint8_t* dst_a, int width) =
+ SplitARGBRow_C;
+
+ assert(height > 0);
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ if (src_stride_argb == width * 4 && dst_stride_r == width &&
+ dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b =
+ dst_stride_a = 0;
+ }
+
+#if defined(HAS_SPLITARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitARGBRow = SplitARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ SplitARGBRow = SplitARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SplitARGBRow = SplitARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ SplitARGBRow = SplitARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SPLITARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitARGBRow = SplitARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitARGBRow = SplitARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitARGBRow = SplitARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitARGBRow = SplitARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ SplitARGBRow = SplitARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width);
+ dst_r += dst_stride_r;
+ dst_g += dst_stride_g;
+ dst_b += dst_stride_b;
+ dst_a += dst_stride_a;
+ src_argb += src_stride_argb;
+ }
+}
+
+LIBYUV_NOINLINE
+static void SplitARGBPlaneOpaque(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int y;
+ void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+ uint8_t* dst_b, int width) = SplitXRGBRow_C;
+ assert(height > 0);
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ if (src_stride_argb == width * 4 && dst_stride_r == width &&
+ dst_stride_g == width && dst_stride_b == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
+ }
+
+#if defined(HAS_SPLITXRGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitXRGBRow = SplitXRGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ SplitXRGBRow = SplitXRGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITXRGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SplitXRGBRow = SplitXRGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ SplitXRGBRow = SplitXRGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SPLITXRGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitXRGBRow = SplitXRGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitXRGBRow = SplitXRGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITXRGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitXRGBRow = SplitXRGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitXRGBRow = SplitXRGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITXRGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ SplitXRGBRow = SplitXRGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width);
+ dst_r += dst_stride_r;
+ dst_g += dst_stride_g;
+ dst_b += dst_stride_b;
+ src_argb += src_stride_argb;
+ }
+}
+
+LIBYUV_API
+void SplitARGBPlane(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
+ dst_r = dst_r + (height - 1) * dst_stride_r;
+ dst_g = dst_g + (height - 1) * dst_stride_g;
+ dst_b = dst_b + (height - 1) * dst_stride_b;
+ dst_a = dst_a + (height - 1) * dst_stride_a;
+ dst_stride_r = -dst_stride_r;
+ dst_stride_g = -dst_stride_g;
+ dst_stride_b = -dst_stride_b;
+ dst_stride_a = -dst_stride_a;
}
-#if defined(HAS_MIRRORROW_NEON)
+
+ if (dst_a == NULL) {
+ SplitARGBPlaneOpaque(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g,
+ dst_stride_g, dst_b, dst_stride_b, width, height);
+ } else {
+ SplitARGBPlaneAlpha(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g,
+ dst_stride_g, dst_b, dst_stride_b, dst_a, dst_stride_a,
+ width, height);
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeARGBPlaneAlpha(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g,
+ const uint8_t* src_b, const uint8_t* src_a,
+ uint8_t* dst_argb, int width) = MergeARGBRow_C;
+
+ assert(height > 0);
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ src_stride_a == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = src_stride_a =
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_MERGEARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeARGBRow = MergeARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ MergeARGBRow = MergeARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeARGBRow = MergeARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeARGBRow = MergeARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- MirrorRow = MirrorRow_Any_NEON;
+ MergeARGBRow = MergeARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
+ MergeARGBRow = MergeARGBRow_NEON;
}
}
#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- MirrorRow = MirrorRow_Any_SSSE3;
+#if defined(HAS_MERGEARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeARGBRow = MergeARGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ src_a += src_stride_a;
+ dst_argb += dst_stride_argb;
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeARGBPlaneOpaque(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
+ const uint8_t* src_b, uint8_t* dst_argb, int width) =
+ MergeXRGBRow_C;
+
+ assert(height > 0);
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
+ }
+#if defined(HAS_MERGEXRGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeXRGBRow = MergeXRGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXRGBRow = MergeXRGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEXRGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeXRGBRow = MergeXRGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSSE3;
+ MergeXRGBRow = MergeXRGBRow_AVX2;
}
}
#endif
-#if defined(HAS_MIRRORROW_AVX2)
+#if defined(HAS_MERGEXRGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeXRGBRow = MergeXRGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MergeXRGBRow = MergeXRGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEXRGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeXRGBRow = MergeXRGBRow_RVV;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeXRGBRow(src_r, src_g, src_b, dst_argb, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_argb += dst_stride_argb;
+ }
+}
+
+LIBYUV_API
+void MergeARGBPlane(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+
+ if (src_a == NULL) {
+ MergeARGBPlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, dst_argb, dst_stride_argb, width,
+ height);
+ } else {
+ MergeARGBPlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, width, height);
+ }
+}
+
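MergeARGBPlane, like the AR64 and ARGB16To8 variants below, dispatches on a NULL alpha pointer. A hypothetical opaque-output call with tightly packed, illustrative strides:

// Sketch: NULL src_a selects the MergeXRGBRow (opaque) path.
#include <stddef.h>
#include <stdint.h>
#include "libyuv/planar_functions.h"

void MergeOpaque(const uint8_t* r, const uint8_t* g, const uint8_t* b,
                 uint8_t* dst_argb, int width, int height) {
  libyuv::MergeARGBPlane(r, width, g, width, b, width, /*src_a=*/NULL,
                         /*src_stride_a=*/0, dst_argb, width * 4, width,
                         height);
}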
+// TODO(yuan): Support 2-bit alpha channel.
+LIBYUV_API
+void MergeXR30Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, uint8_t* dst_ar30, int depth,
+ int width) = MergeXR30Row_C;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+ // Coalesce rows.
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_ar30 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0;
+ }
+#if defined(HAS_MERGEXR30ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- MirrorRow = MirrorRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
+ MergeXR30Row = MergeXR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeXR30Row = MergeXR30Row_AVX2;
}
}
#endif
-#if defined(HAS_MIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MirrorRow = MirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 64)) {
- MirrorRow = MirrorRow_MSA;
+#if defined(HAS_MERGEXR30ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (depth == 10) {
+ MergeXR30Row = MergeXR30Row_10_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXR30Row = MergeXR30Row_10_NEON;
+ }
+ } else {
+ MergeXR30Row = MergeXR30Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXR30Row = MergeXR30Row_NEON;
+ }
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_ar30 += dst_stride_ar30;
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeAR64PlaneAlpha(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, const uint16_t* src_a,
+ uint16_t* dst_argb, int depth, int width) =
+ MergeAR64Row_C;
+
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ src_stride_a == width && dst_stride_ar64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = src_stride_a =
+ dst_stride_ar64 = 0;
+ }
+#if defined(HAS_MERGEAR64ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeAR64Row = MergeAR64Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeAR64Row = MergeAR64Row_AVX2;
}
}
#endif
-#if defined(HAS_MIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MirrorRow = MirrorRow_Any_MMI;
+#if defined(HAS_MERGEAR64ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeAR64Row = MergeAR64Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- MirrorRow = MirrorRow_MMI;
+ MergeAR64Row = MergeAR64Row_NEON;
}
}
#endif
- // Mirror plane
for (y = 0; y < height; ++y) {
- MirrorRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
+ MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ src_a += src_stride_a;
+ dst_ar64 += dst_stride_ar64;
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeAR64PlaneOpaque(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, uint16_t* dst_argb, int depth,
+ int width) = MergeXR64Row_C;
+
+ // Coalesce rows.
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_ar64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0;
+ }
+#if defined(HAS_MERGEXR64ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeXR64Row = MergeXR64Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeXR64Row = MergeXR64Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEXR64ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeXR64Row = MergeXR64Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXR64Row = MergeXR64Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_ar64 += dst_stride_ar64;
+ }
+}
+
+LIBYUV_API
+void MergeAR64Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height,
+ int depth) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64;
+ dst_stride_ar64 = -dst_stride_ar64;
+ }
+
+ if (src_a == NULL) {
+ MergeAR64PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, dst_ar64, dst_stride_ar64, width, height,
+ depth);
+ } else {
+ MergeAR64PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, src_a, src_stride_a, dst_ar64,
+ dst_stride_ar64, width, height, depth);
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeARGB16To8PlaneAlpha(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, const uint16_t* src_a,
+ uint8_t* dst_argb, int depth, int width) =
+ MergeARGB16To8Row_C;
+
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ src_stride_a == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = src_stride_a =
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_MERGEARGB16TO8ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeARGB16To8Row = MergeARGB16To8Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEARGB16TO8ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeARGB16To8Row = MergeARGB16To8Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ src_a += src_stride_a;
+ dst_argb += dst_stride_argb;
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeARGB16To8PlaneOpaque(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, uint8_t* dst_argb, int depth,
+ int width) = MergeXRGB16To8Row_C;
+
+ // Coalesce rows.
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
+ }
+#if defined(HAS_MERGEXRGB16TO8ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEXRGB16TO8ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXRGB16To8Row = MergeXRGB16To8Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_argb += dst_stride_argb;
+ }
+}
+
+LIBYUV_API
+void MergeARGB16To8Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int depth) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+
+ if (src_a == NULL) {
+ MergeARGB16To8PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, dst_argb, dst_stride_argb, width,
+ height, depth);
+ } else {
+ MergeARGB16To8PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, width, height, depth);
}
}
@@ -820,7 +2118,7 @@ int YUY2ToI422(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MSA)
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
@@ -830,13 +2128,23 @@ int YUY2ToI422(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- YUY2ToYRow = YUY2ToYRow_Any_MMI;
- YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToYRow = YUY2ToYRow_MMI;
- YUY2ToUV422Row = YUY2ToUV422Row_MMI;
+#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LSX;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_LSX;
+ YUY2ToUV422Row = YUY2ToUV422Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LASX;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_LASX;
+ YUY2ToUV422Row = YUY2ToUV422Row_LASX;
}
}
#endif
@@ -916,7 +2224,7 @@ int UYVYToI422(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_UYVYTOYROW_MSA)
+#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToYRow = UYVYToYRow_Any_MSA;
UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
@@ -926,13 +2234,23 @@ int UYVYToI422(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_UYVYTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- UYVYToYRow = UYVYToYRow_Any_MMI;
- UYVYToUV422Row = UYVYToUV422Row_Any_MMI;
+#if defined(HAS_UYVYTOYROW_LSX) && defined(HAS_UYVYTOUV422ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ UYVYToYRow = UYVYToYRow_Any_LSX;
+ UYVYToUV422Row = UYVYToUV422Row_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_MMI;
- UYVYToUV422Row = UYVYToUV422Row_MMI;
+ UYVYToYRow = UYVYToYRow_LSX;
+ UYVYToUV422Row = UYVYToUV422Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_LASX) && defined(HAS_UYVYTOUV422ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ UYVYToYRow = UYVYToYRow_Any_LASX;
+ UYVYToUV422Row = UYVYToUV422Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_LASX;
+ UYVYToUV422Row = UYVYToUV422Row_LASX;
}
}
#endif
@@ -1006,23 +2324,238 @@ int YUY2ToY(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- YUY2ToYRow = YUY2ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToYRow = YUY2ToYRow_MMI;
+
+ for (y = 0; y < height; ++y) {
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
+// Convert UYVY to Y.
+LIBYUV_API
+int UYVYToY(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+ UYVYToYRow_C;
+ if (!src_uyvy || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_uyvy == width * 2 && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_y = 0;
+ }
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToYRow = UYVYToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ UYVYToYRow = UYVYToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_LSX;
}
}
#endif
for (y = 0; y < height; ++y) {
- YUY2ToYRow(src_yuy2, dst_y, width);
- src_yuy2 += src_stride_yuy2;
+ UYVYToYRow(src_uyvy, dst_y, width);
+ src_uyvy += src_stride_uyvy;
dst_y += dst_stride_y;
}
return 0;
}
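+
+// A minimal usage sketch for UYVYToY, assuming a contiguous 640x480 UYVY
+// frame; the buffer names and dimensions are illustrative only. Because
+// src_stride is width * 2 and dst_stride is width, the call hits the
+// coalesce-rows fast path above and processes the frame as one long row.
+static void ExampleExtractLumaFromUYVY(const uint8_t* uyvy, uint8_t* y_plane) {
+ UYVYToY(uyvy, 640 * 2, y_plane, 640, 640, 480);
+}
+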
+// Mirror a plane of data.
+// See also I400Mirror.
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MirrorRow = MirrorRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ MirrorRow = MirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_LASX;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ MirrorRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
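+// MirrorRow reverses a single row: dst[x] = src[width - 1 - x]. A scalar
+// sketch equivalent to the MirrorRow_C fallback selected above
+// (illustrative, not the tuned SIMD paths):
+static void ExampleMirrorRow(const uint8_t* src, uint8_t* dst, int width) {
+ int x;
+ src += width - 1; // Start at the last byte of the source row.
+ for (x = 0; x < width; ++x) {
+ dst[x] = src[-x];
+ }
+}
+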
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
+ MirrorUVRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+#if defined(HAS_MIRRORUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorUVRow = MirrorUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorUVRow = MirrorUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorUVRow = MirrorUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorUVRow = MirrorUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorUVRow = MirrorUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorUVRow = MirrorUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MirrorUVRow = MirrorUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ MirrorUVRow = MirrorUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorUVRow = MirrorUVRow_LASX;
+ }
+ }
+#endif
+
+ // MirrorUV plane
+ for (y = 0; y < height; ++y) {
+ MirrorUVRow(src_uv, dst_uv, width);
+ src_uv += src_stride_uv;
+ dst_uv += dst_stride_uv;
+ }
+}
+
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,
@@ -1063,10 +2596,12 @@ int I420Mirror(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+
+ if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -1087,6 +2622,43 @@ int I420Mirror(const uint8_t* src_y,
return 0;
}
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+
+ if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ if (dst_y) {
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
+ halfheight);
+ return 0;
+}
+
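+// NV12Mirror composes two plane mirrors: the Y plane is mirrored byte by
+// byte, while the interleaved UV plane is mirrored in 2-byte UV pairs so
+// each chroma pair stays intact. A scalar sketch of that pair reversal,
+// in the spirit of the MirrorUVRow_C reference (illustrative):
+static void ExampleMirrorUVRow(const uint8_t* src_uv, uint8_t* dst_uv,
+ int width) {
+ int x;
+ src_uv += (width - 1) * 2; // Last UV pair; width counts pairs.
+ for (x = 0; x < width; ++x) {
+ dst_uv[0] = src_uv[0]; // U
+ dst_uv[1] = src_uv[1]; // V
+ dst_uv += 2;
+ src_uv -= 2;
+ }
+}
+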
// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8_t* src_argb,
@@ -1110,7 +2682,7 @@ int ARGBMirror(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -1139,11 +2711,19 @@ int ARGBMirror(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBMIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBMirrorRow = ARGBMirrorRow_MMI;
+#if defined(HAS_ARGBMIRRORROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_LASX;
}
}
#endif
@@ -1157,35 +2737,50 @@ int ARGBMirror(const uint8_t* src_argb,
return 0;
}
-// Get a blender that optimized for the CPU and pixel count.
-// As there are 6 blenders to choose from, the caller should try to use
-// the same blend function for all pixels if possible.
+// RGB24 mirror.
LIBYUV_API
-ARGBBlendRow GetARGBBlend() {
- void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
- uint8_t* dst_argb, int width) = ARGBBlendRow_C;
-#if defined(HAS_ARGBBLENDROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBBlendRow = ARGBBlendRow_SSSE3;
- return ARGBBlendRow;
+int RGB24Mirror(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ int y;
+ void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
+ RGB24MirrorRow_C;
+ if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
}
-#endif
-#if defined(HAS_ARGBBLENDROW_NEON)
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+#if defined(HAS_RGB24MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- ARGBBlendRow = ARGBBlendRow_NEON;
+ RGB24MirrorRow = RGB24MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24MirrorRow = RGB24MirrorRow_NEON;
+ }
}
#endif
-#if defined(HAS_ARGBBLENDROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBBlendRow = ARGBBlendRow_MSA;
+#if defined(HAS_RGB24MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24MirrorRow = RGB24MirrorRow_SSSE3;
+ }
}
#endif
-#if defined(HAS_ARGBBLENDROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBBlendRow = ARGBBlendRow_MMI;
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ RGB24MirrorRow(src_rgb24, dst_rgb24, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_rgb24 += dst_stride_rgb24;
}
-#endif
- return ARGBBlendRow;
+ return 0;
}
// Alpha Blend 2 ARGB images and store to destination.
@@ -1200,7 +2795,7 @@ int ARGBBlend(const uint8_t* src_argb0,
int height) {
int y;
void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
- uint8_t* dst_argb, int width) = GetARGBBlend();
+ uint8_t* dst_argb, int width) = ARGBBlendRow_C;
if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1217,7 +2812,31 @@ int ARGBBlend(const uint8_t* src_argb0,
height = 1;
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
-
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBBlendRow = ARGBBlendRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBBlendRow = ARGBBlendRow_NEON;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBBlendRow = ARGBBlendRow_MSA;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBBlendRow = ARGBBlendRow_LSX;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBBlendRow = ARGBBlendRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
src_argb0 += src_stride_argb0;
@@ -1277,12 +2896,9 @@ int BlendPlane(const uint8_t* src_y0,
}
}
#endif
-#if defined(HAS_BLENDPLANEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BlendPlaneRow = BlendPlaneRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- BlendPlaneRow = BlendPlaneRow_MMI;
- }
+#if defined(HAS_BLENDPLANEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ BlendPlaneRow = BlendPlaneRow_RVV;
}
#endif
@@ -1329,6 +2945,7 @@ int I420Blend(const uint8_t* src_y0,
BlendPlaneRow_C;
void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+
if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
!alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -1361,12 +2978,9 @@ int I420Blend(const uint8_t* src_y0,
}
}
#endif
-#if defined(HAS_BLENDPLANEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BlendPlaneRow = BlendPlaneRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- BlendPlaneRow = BlendPlaneRow_MMI;
- }
+#if defined(HAS_BLENDPLANEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ BlendPlaneRow = BlendPlaneRow_RVV;
}
#endif
if (!IS_ALIGNED(width, 2)) {
@@ -1405,20 +3019,16 @@ int I420Blend(const uint8_t* src_y0,
}
}
#endif
-#if defined(HAS_SCALEROWDOWN2_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI;
- if (IS_ALIGNED(width, 2)) {
- ScaleRowDown2 = ScaleRowDown2Box_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- ScaleRowDown2 = ScaleRowDown2Box_MMI;
- }
- }
+#if defined(HAS_SCALEROWDOWN2_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowDown2 = ScaleRowDown2Box_RVV;
}
#endif
// Row buffer for intermediate alpha pixels.
align_buffer_64(halfalpha, halfwidth);
+ if (!halfalpha)
+ return 1;
for (y = 0; y < height; y += 2) {
// last row of odd height image use 1 row of alpha instead of 2.
if (y == (height - 1)) {
@@ -1501,11 +3111,19 @@ int ARGBMultiply(const uint8_t* src_argb0,
}
}
#endif
-#if defined(HAS_ARGBMULTIPLYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBMultiplyRow = ARGBMultiplyRow_MMI;
+#if defined(HAS_ARGBMULTIPLYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_LSX;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_LASX;
}
}
#endif
@@ -1549,12 +3167,12 @@ int ARGBAdd(const uint8_t* src_argb0,
height = 1;
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
-#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
+#if defined(HAS_ARGBADDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_SSE2;
}
#endif
-#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
+#if defined(HAS_ARGBADDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
@@ -1586,11 +3204,19 @@ int ARGBAdd(const uint8_t* src_argb0,
}
}
#endif
-#if defined(HAS_ARGBADDROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBAddRow = ARGBAddRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBAddRow = ARGBAddRow_MMI;
+#if defined(HAS_ARGBADDROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAddRow = ARGBAddRow_Any_LSX;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAddRow = ARGBAddRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAddRow = ARGBAddRow_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_LASX;
}
}
#endif
@@ -1666,11 +3292,19 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
}
#endif
-#if defined(HAS_ARGBSUBTRACTROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBSubtractRow = ARGBSubtractRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBSubtractRow = ARGBSubtractRow_MMI;
+#if defined(HAS_ARGBSUBTRACTROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_LSX;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSubtractRow = ARGBSubtractRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_LASX;
}
}
#endif
@@ -1684,177 +3318,6 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
return 0;
}
-// Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGBARow = I422ToRGBARow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I422 to RGBA.
-LIBYUV_API
-int I422ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgba, dst_stride_rgba,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I422 to BGRA.
-LIBYUV_API
-int I422ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*NV12ToRGB565Row)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
- if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_NV12TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
// Convert RAW to RGB24.
LIBYUV_API
@@ -1906,14 +3369,19 @@ int RAWToRGB24(const uint8_t* src_raw,
}
}
#endif
-#if defined(HAS_RAWTORGB24ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RAWToRGB24Row = RAWToRGB24Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- RAWToRGB24Row = RAWToRGB24Row_MMI;
+#if defined(HAS_RAWTORGB24ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToRGB24Row = RAWToRGB24Row_LSX;
}
}
#endif
+#if defined(HAS_RAWTORGB24ROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToRGB24Row = RAWToRGB24Row_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
RAWToRGB24Row(src_raw, dst_rgb24, width);
@@ -1923,6 +3391,7 @@ int RAWToRGB24(const uint8_t* src_raw,
return 0;
}
+// TODO(fbarchard): Consider uint8_t value
LIBYUV_API
void SetPlane(uint8_t* dst_y,
int dst_stride_y,
@@ -1930,7 +3399,11 @@ void SetPlane(uint8_t* dst_y,
int height,
uint32_t value) {
int y;
- void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C;
+ void (*SetRow)(uint8_t* dst, uint8_t value, int width) = SetRow_C;
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
@@ -1968,10 +3441,18 @@ void SetPlane(uint8_t* dst_y,
SetRow = SetRow_MSA;
}
#endif
+#if defined(HAS_SETROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SetRow = SetRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ SetRow = SetRow_LSX;
+ }
+ }
+#endif
// Set plane
for (y = 0; y < height; ++y) {
- SetRow(dst_y, value, width);
+ SetRow(dst_y, (uint8_t)value, width);
dst_y += dst_stride_y;
}
}
@@ -1996,6 +3477,7 @@ int I420Rect(uint8_t* dst_y,
uint8_t* start_y = dst_y + y * dst_stride_y + x;
uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+
if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 ||
y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 ||
value_v < 0 || value_v > 255) {
@@ -2018,7 +3500,7 @@ int ARGBRect(uint8_t* dst_argb,
int height,
uint32_t value) {
int y;
- void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) =
+ void (*ARGBSetRow)(uint8_t* dst_argb, uint32_t value, int width) =
ARGBSetRow_C;
if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) {
return -1;
@@ -2057,6 +3539,14 @@ int ARGBRect(uint8_t* dst_argb,
}
}
#endif
+#if defined(HAS_ARGBSETROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBSetRow = ARGBSetRow_Any_LSX;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_LSX;
+ }
+ }
+#endif
// Set plane
for (y = 0; y < height; ++y) {
@@ -2135,14 +3625,27 @@ int ARGBAttenuate(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBATTENUATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBAttenuateRow = ARGBAttenuateRow_MMI;
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBAttenuateRow(src_argb, dst_argb, width);
@@ -2243,9 +3746,14 @@ int ARGBGrayTo(const uint8_t* src_argb,
ARGBGrayRow = ARGBGrayRow_MSA;
}
#endif
-#if defined(HAS_ARGBGRAYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBGrayRow = ARGBGrayRow_MMI;
+#if defined(HAS_ARGBGRAYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_LSX;
+ }
+#endif
+#if defined(HAS_ARGBGRAYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) {
+ ARGBGrayRow = ARGBGrayRow_LASX;
}
#endif
@@ -2293,9 +3801,14 @@ int ARGBGray(uint8_t* dst_argb,
ARGBGrayRow = ARGBGrayRow_MSA;
}
#endif
-#if defined(HAS_ARGBGRAYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBGrayRow = ARGBGrayRow_MMI;
+#if defined(HAS_ARGBGRAYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_LSX;
+ }
+#endif
+#if defined(HAS_ARGBGRAYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) {
+ ARGBGrayRow = ARGBGrayRow_LASX;
}
#endif
@@ -2315,7 +3828,7 @@ int ARGBSepia(uint8_t* dst_argb,
int width,
int height) {
int y;
- void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C;
+ void (*ARGBSepiaRow)(uint8_t* dst_argb, int width) = ARGBSepiaRow_C;
uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
return -1;
@@ -2341,9 +3854,14 @@ int ARGBSepia(uint8_t* dst_argb,
ARGBSepiaRow = ARGBSepiaRow_MSA;
}
#endif
-#if defined(HAS_ARGBSEPIAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBSepiaRow = ARGBSepiaRow_MMI;
+#if defined(HAS_ARGBSEPIAROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) {
+ ARGBSepiaRow = ARGBSepiaRow_LSX;
+ }
+#endif
+#if defined(HAS_ARGBSEPIAROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) {
+ ARGBSepiaRow = ARGBSepiaRow_LASX;
}
#endif
@@ -2397,9 +3915,9 @@ int ARGBColorMatrix(const uint8_t* src_argb,
ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
}
#endif
-#if defined(HAS_ARGBCOLORMATRIXROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBColorMatrixRow = ARGBColorMatrixRow_MMI;
+#if defined(HAS_ARGBCOLORMATRIXROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_LSX;
}
#endif
for (y = 0; y < height; ++y) {
@@ -2458,7 +3976,7 @@ int ARGBColorTable(uint8_t* dst_argb,
int width,
int height) {
int y;
- void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb,
+ void (*ARGBColorTableRow)(uint8_t* dst_argb, const uint8_t* table_argb,
int width) = ARGBColorTableRow_C;
uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
@@ -2494,7 +4012,7 @@ int RGBColorTable(uint8_t* dst_argb,
int width,
int height) {
int y;
- void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb,
+ void (*RGBColorTableRow)(uint8_t* dst_argb, const uint8_t* table_argb,
int width) = RGBColorTableRow_C;
uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
@@ -2539,7 +4057,7 @@ int ARGBQuantize(uint8_t* dst_argb,
int width,
int height) {
int y;
- void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size,
+ void (*ARGBQuantizeRow)(uint8_t* dst_argb, int scale, int interval_size,
int interval_offset, int width) = ARGBQuantizeRow_C;
uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
@@ -2567,6 +4085,11 @@ int ARGBQuantize(uint8_t* dst_argb,
ARGBQuantizeRow = ARGBQuantizeRow_MSA;
}
#endif
+#if defined(HAS_ARGBQUANTIZEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_LSX;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
dst += dst_stride_argb;
@@ -2596,11 +4119,6 @@ int ARGBComputeCumulativeSum(const uint8_t* src_argb,
ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
}
#endif
-#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
- }
-#endif
memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
for (y = 0; y < height; ++y) {
@@ -2651,7 +4169,7 @@ int ARGBBlur(const uint8_t* src_argb,
if (radius > (width / 2 - 1)) {
radius = width / 2 - 1;
}
- if (radius <= 0) {
+ if (radius <= 0 || height <= 1) {
return -1;
}
#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
@@ -2660,11 +4178,6 @@ int ARGBBlur(const uint8_t* src_argb,
CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
}
#endif
-#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
- }
-#endif
// Compute enough CumulativeSum for first row to be blurred. After this
// one row of CumulativeSum is updated at a time.
ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
@@ -2771,9 +4284,14 @@ int ARGBShade(const uint8_t* src_argb,
ARGBShadeRow = ARGBShadeRow_MSA;
}
#endif
-#if defined(HAS_ARGBSHADEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBShadeRow = ARGBShadeRow_MMI;
+#if defined(HAS_ARGBSHADEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 4)) {
+ ARGBShadeRow = ARGBShadeRow_LSX;
+ }
+#endif
+#if defined(HAS_ARGBSHADEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 8)) {
+ ARGBShadeRow = ARGBShadeRow_LASX;
}
#endif
@@ -2797,7 +4315,7 @@ int InterpolatePlane(const uint8_t* src0,
int height,
int interpolation) {
int y;
- void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
@@ -2847,14 +4365,19 @@ int InterpolatePlane(const uint8_t* src0,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
InterpolateRow(dst, src0, src1 - src0, width, interpolation);
@@ -2865,6 +4388,86 @@ int InterpolatePlane(const uint8_t* src0,
return 0;
}
+// Interpolate 2 planes by specified amount (0 to 255).
+LIBYUV_API
+int InterpolatePlane_16(const uint16_t* src0,
+ int src_stride0,
+ const uint16_t* src1,
+ int src_stride1,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ int interpolation) {
+ int y;
+ void (*InterpolateRow_16)(uint16_t* dst_ptr, const uint16_t* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_16_C;
+ if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst = dst + (height - 1) * dst_stride;
+ dst_stride = -dst_stride;
+ }
+ // Coalesce rows.
+ if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
+ width *= height;
+ height = 1;
+ src_stride0 = src_stride1 = dst_stride = 0;
+ }
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow_16 = InterpolateRow_16_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow_16 = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow_16 = InterpolateRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow_16 = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow_16 = InterpolateRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow_16 = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow_16 = InterpolateRow_16_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow_16 = InterpolateRow_16_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow_16 = InterpolateRow_16_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow_16 = InterpolateRow_16_LSX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ InterpolateRow_16(dst, src0, src1 - src0, width, interpolation);
+ src0 += src_stride0;
+ src1 += src_stride1;
+ dst += dst_stride;
+ }
+ return 0;
+}
+
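+// The row interpolators compute a fixed-point weighted average: for a
+// fraction f in [0, 256), dst = (src0 * (256 - f) + src1 * f) >> 8, so
+// f == 0 copies src0 and f == 128 is an even average. A one-sample scalar
+// sketch of the 16-bit blend (illustrative; the shipped rows are SIMD):
+static uint16_t ExampleInterpolateSample_16(uint16_t s0, uint16_t s1, int f) {
+ return (uint16_t)((s0 * (256 - f) + s1 * f) >> 8);
+}
+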
// Interpolate 2 ARGB images by specified amount (0 to 255).
LIBYUV_API
int ARGBInterpolate(const uint8_t* src_argb0,
@@ -2906,10 +4509,12 @@ int I420Interpolate(const uint8_t* src0_y,
int interpolation) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
+
if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v ||
!dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
+
InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y,
dst_stride_y, width, height, interpolation);
InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u,
@@ -2978,11 +4583,19 @@ int ARGBShuffle(const uint8_t* src_bgra,
}
}
#endif
-#if defined(HAS_ARGBSHUFFLEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBShuffleRow = ARGBShuffleRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBShuffleRow = ARGBShuffleRow_MMI;
+#if defined(HAS_ARGBSHUFFLEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBShuffleRow = ARGBShuffleRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBShuffleRow = ARGBShuffleRow_LASX;
}
}
#endif
@@ -2995,6 +4608,144 @@ int ARGBShuffle(const uint8_t* src_bgra,
return 0;
}
+// Shuffle AR64 channel order, e.g. AR64 to AB64.
+LIBYUV_API
+int AR64Shuffle(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ const uint8_t* shuffler,
+ int width,
+ int height) {
+ int y;
+ void (*AR64ShuffleRow)(const uint8_t* src_ar64, uint8_t* dst_ar64,
+ const uint8_t* shuffler, int width) = AR64ShuffleRow_C;
+ if (!src_ar64 || !dst_ar64 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar64 = src_ar64 + (height - 1) * src_stride_ar64;
+ src_stride_ar64 = -src_stride_ar64;
+ }
+ // Coalesce rows.
+ if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar64 = dst_stride_ar64 = 0;
+ }
+ // The ARGB byte-shuffle assembly versions can be reused here, since the
+ // AR64 shuffle is also expressible as a byte shuffle.
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ AR64ShuffleRow = ARGBShuffleRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AR64ShuffleRow = ARGBShuffleRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ AR64ShuffleRow = ARGBShuffleRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AR64ShuffleRow = ARGBShuffleRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ AR64ShuffleRow = ARGBShuffleRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ AR64ShuffleRow((uint8_t*)(src_ar64), (uint8_t*)(dst_ar64), shuffler,
+ width * 2);
+ src_ar64 += src_stride_ar64;
+ dst_ar64 += dst_stride_ar64;
+ }
+ return 0;
+}
+
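+// Why the width * 2 trick in AR64Shuffle above works: an AR64 pixel is
+// four 16-bit channels, i.e. 8 bytes, which is exactly two 4-byte ARGB
+// pixels, so a byte-shuffle kernel written for ARGB covers an AR64 row
+// when told it has twice as many pixels:
+//
+// AR64 row bytes = width * 8 = (width * 2) * 4 = ARGB row bytes.
+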
+// Gaussian blur a float plane using a 5x5 filter with coefficients
+// 1, 4, 6, 4, 1. Each destination pixel is a blur of the 5x5 source
+// pixels around it. Source edges are clamped: the edge is 2 pixels on
+// each side, and the interior is a multiple of 4.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+ int src_stride,
+ float* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int y;
+ void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2,
+ const float* src3, const float* src4, float* dst,
+ int width) = GaussCol_F32_C;
+ void (*GaussRow_F32)(const float* src, float* dst, int width) =
+ GaussRow_F32_C;
+ if (!src || !dst || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+#if defined(HAS_GAUSSCOL_F32_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ GaussCol_F32 = GaussCol_F32_NEON;
+ }
+#endif
+#if defined(HAS_GAUSSROW_F32_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ GaussRow_F32 = GaussRow_F32_NEON;
+ }
+#endif
+ {
+ // 2 pixels on each side, but aligned out to 16 bytes.
+ align_buffer_64(rowbuf, (4 + width + 4) * 4);
+ if (!rowbuf)
+ return 1;
+ memset(rowbuf, 0, 16);
+ memset(rowbuf + (4 + width) * 4, 0, 16);
+ float* row = (float*)(rowbuf + 16);
+ const float* src0 = src;
+ const float* src1 = src;
+ const float* src2 = src;
+ const float* src3 = src2 + ((height > 1) ? src_stride : 0);
+ const float* src4 = src3 + ((height > 2) ? src_stride : 0);
+
+ for (y = 0; y < height; ++y) {
+ GaussCol_F32(src0, src1, src2, src3, src4, row, width);
+
+ // Extrude edge by 2 floats
+ row[-2] = row[-1] = row[0];
+ row[width + 1] = row[width] = row[width - 1];
+
+ GaussRow_F32(row - 2, dst, width);
+
+ src0 = src1;
+ src1 = src2;
+ src2 = src3;
+ src3 = src4;
+ if ((y + 2) < (height - 1)) {
+ src4 += src_stride;
+ }
+ dst += dst_stride;
+ }
+ free_aligned_buffer_64(rowbuf);
+ }
+ return 0;
+}
+
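+// A scalar sketch of the horizontal pass, under the assumption (matching
+// the C reference path) that the column pass is unscaled and the row pass
+// applies the combined 1/256 gain of both 1-4-6-4-1 kernels; src points 2
+// floats before the first center pixel (illustrative):
+static void ExampleGaussRow_F32(const float* src, float* dst, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst[i] = (src[0] + src[1] * 4.f + src[2] * 6.f + src[3] * 4.f + src[4]) *
+ (1.0f / 256.0f);
+ ++src;
+ }
+}
+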
// Sobel ARGB effect.
static int ARGBSobelize(const uint8_t* src_argb,
int src_stride_argb,
@@ -3044,7 +4795,7 @@ static int ARGBSobelize(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
@@ -3057,14 +4808,27 @@ static int ARGBSobelize(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYJRow = ARGBToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_MMI;
+#if defined(HAS_ARGBTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_LSX;
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYJRow = ARGBToYJRow_RVV;
+ }
+#endif
#if defined(HAS_SOBELYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -3081,11 +4845,6 @@ static int ARGBSobelize(const uint8_t* src_argb,
SobelYRow = SobelYRow_MSA;
}
#endif
-#if defined(HAS_SOBELYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelYRow = SobelYRow_MMI;
- }
-#endif
#if defined(HAS_SOBELXROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SobelXRow = SobelXRow_SSE2;
@@ -3101,23 +4860,20 @@ static int ARGBSobelize(const uint8_t* src_argb,
SobelXRow = SobelXRow_MSA;
}
#endif
-#if defined(HAS_SOBELXROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelXRow = SobelXRow_MMI;
- }
-#endif
{
// 3 rows with edges before/after.
- const int kRowSize = (width + kEdge + 31) & ~31;
- align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
+ const int row_size = (width + kEdge + 31) & ~31;
+ align_buffer_64(rows, row_size * 2 + (kEdge + row_size * 3 + kEdge));
uint8_t* row_sobelx = rows;
- uint8_t* row_sobely = rows + kRowSize;
- uint8_t* row_y = rows + kRowSize * 2;
+ uint8_t* row_sobely = rows + row_size;
+ uint8_t* row_y = rows + row_size * 2;
// Convert first row.
uint8_t* row_y0 = row_y + kEdge;
- uint8_t* row_y1 = row_y0 + kRowSize;
- uint8_t* row_y2 = row_y1 + kRowSize;
+ uint8_t* row_y1 = row_y0 + row_size;
+ uint8_t* row_y2 = row_y1 + row_size;
+ if (!rows)
+ return 1;
ARGBToYJRow(src_argb, row_y0, width);
row_y0[-1] = row_y0[0];
memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
@@ -3188,11 +4944,11 @@ int ARGBSobel(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SOBELROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelRow = SobelRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SobelRow = SobelRow_MMI;
+#if defined(HAS_SOBELROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SobelRow = SobelRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ SobelRow = SobelRow_LSX;
}
}
#endif
@@ -3234,11 +4990,11 @@ int ARGBSobelToPlane(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SOBELTOPLANEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelToPlaneRow = SobelToPlaneRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SobelToPlaneRow = SobelToPlaneRow_MMI;
+#if defined(HAS_SOBELTOPLANEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ SobelToPlaneRow = SobelToPlaneRow_LSX;
}
}
#endif
@@ -3281,11 +5037,11 @@ int ARGBSobelXY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SOBELXYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelXYRow = SobelXYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SobelXYRow = SobelXYRow_MMI;
+#if defined(HAS_SOBELXYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SobelXYRow = SobelXYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ SobelXYRow = SobelXYRow_LSX;
}
}
#endif
@@ -3412,6 +5168,14 @@ int HalfFloatPlane(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_HALFFLOATROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ HalfFloatRow = HalfFloatRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ HalfFloatRow = HalfFloatRow_LSX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
HalfFloatRow(src_y, dst_y, scale, width);
@@ -3526,14 +5290,6 @@ int ARGBCopyAlpha(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBCOPYALPHAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI;
- }
- }
-#endif
for (y = 0; y < height; ++y) {
ARGBCopyAlphaRow(src_argb, dst_argb, width);
@@ -3592,10 +5348,15 @@ int ARGBExtractAlpha(const uint8_t* src_argb,
: ARGBExtractAlphaRow_Any_MSA;
}
#endif
-#if defined(HAS_ARGBEXTRACTALPHAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI
- : ARGBExtractAlphaRow_Any_MMI;
+#if defined(HAS_ARGBEXTRACTALPHAROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_LSX
+ : ARGBExtractAlphaRow_Any_LSX;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV;
}
#endif
@@ -3649,12 +5410,9 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI;
- }
+#if defined(HAS_ARGBCOPYYTOALPHAROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_RVV;
}
#endif
@@ -3666,9 +5424,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
return 0;
}
-// TODO(fbarchard): Consider if width is even Y channel can be split
-// directly. A SplitUVRow_Odd function could copy the remaining chroma.
-
LIBYUV_API
int YUY2ToNV12(const uint8_t* src_yuy2,
int src_stride_yuy2,
@@ -3679,124 +5434,105 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
int width,
int height) {
int y;
- int halfwidth = (width + 1) >> 1;
- void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
- int width) = SplitUVRow_C;
- void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) = InterpolateRow_C;
+ void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+ YUY2ToYRow_C;
+ void (*YUY2ToNVUVRow)(const uint8_t* src_yuy2, int stride_yuy2,
+ uint8_t* dst_uv, int width) = YUY2ToNVUVRow_C;
if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
-#if defined(HAS_SPLITUVROW_SSE2)
+#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- SplitUVRow = SplitUVRow_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- SplitUVRow = SplitUVRow_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
#endif
-#if defined(HAS_SPLITUVROW_AVX2)
+#if defined(HAS_YUY2TOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- SplitUVRow = SplitUVRow_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- SplitUVRow = SplitUVRow_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
}
}
#endif
-#if defined(HAS_SPLITUVROW_NEON)
+#if defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- SplitUVRow = SplitUVRow_Any_NEON;
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
- SplitUVRow = SplitUVRow_NEON;
+ YUY2ToYRow = YUY2ToYRow_NEON;
}
}
#endif
-#if defined(HAS_SPLITUVROW_MSA)
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
- SplitUVRow = SplitUVRow_Any_MSA;
+ YUY2ToYRow = YUY2ToYRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
- SplitUVRow = SplitUVRow_MSA;
- }
- }
-#endif
-#if defined(HAS_SPLITUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SplitUVRow = SplitUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SplitUVRow = SplitUVRow_MMI;
+ YUY2ToYRow = YUY2ToYRow_MSA;
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_SSSE3;
+#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
+ YUY2ToYRow = YUY2ToYRow_LSX;
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_AVX2;
+#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
- InterpolateRow = InterpolateRow_AVX2;
+ YUY2ToYRow = YUY2ToYRow_LASX;
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_NEON;
+
+#if defined(HAS_YUY2TONVUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ YUY2ToNVUVRow = YUY2ToNVUVRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- InterpolateRow = InterpolateRow_NEON;
+ YUY2ToNVUVRow = YUY2ToNVUVRow_SSE2;
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- InterpolateRow = InterpolateRow_Any_MSA;
+#if defined(HAS_YUY2TONVUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToNVUVRow = YUY2ToNVUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- InterpolateRow = InterpolateRow_MSA;
+ YUY2ToNVUVRow = YUY2ToNVUVRow_AVX2;
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_YUY2TONVUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToNVUVRow = YUY2ToNVUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToNVUVRow = YUY2ToNVUVRow_NEON;
}
}
#endif
- {
- int awidth = halfwidth * 2;
- // row of y and 2 rows of uv
- align_buffer_64(rows, awidth * 3);
-
- for (y = 0; y < height - 1; y += 2) {
- // Split Y from UV.
- SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
- memcpy(dst_y, rows, width);
- SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
- memcpy(dst_y + dst_stride_y, rows, width);
- InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
- src_yuy2 += src_stride_yuy2 * 2;
- dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
- }
- if (height & 1) {
- // Split Y from UV.
- SplitUVRow(src_yuy2, rows, dst_uv, awidth);
- memcpy(dst_y, rows, width);
- }
- free_aligned_buffer_64(rows);
+ for (y = 0; y < height - 1; y += 2) {
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+ YUY2ToNVUVRow(src_yuy2, src_stride_yuy2, dst_uv, width);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ YUY2ToNVUVRow(src_yuy2, 0, dst_uv, width);
}
return 0;
}
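+
+// YUY2 packs two pixels as Y0 U Y1 V. The loop above writes two Y rows
+// directly and one UV row by averaging the chroma of the row pair (the
+// odd-height tail passes stride 0, averaging the row with itself). A
+// scalar sketch of that vertical chroma filter (illustrative):
+static void ExampleYUY2ToNVUVRow(const uint8_t* src_yuy2, int stride_yuy2,
+ uint8_t* dst_uv, int width) {
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_uv[0] = (src_yuy2[1] + src_yuy2[stride_yuy2 + 1] + 1) >> 1; // U
+ dst_uv[1] = (src_yuy2[3] + src_yuy2[stride_yuy2 + 3] + 1) >> 1; // V
+ src_yuy2 += 4;
+ dst_uv += 2;
+ }
+}
+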
@@ -3814,12 +5550,14 @@ int UYVYToNV12(const uint8_t* src_uyvy,
int halfwidth = (width + 1) >> 1;
void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
int width) = SplitUVRow_C;
- void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
+
if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -3858,14 +5596,20 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_SPLITUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SplitUVRow = SplitUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SplitUVRow = SplitUVRow_MMI;
+#if defined(HAS_SPLITUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SplitUVRow = SplitUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_LSX;
}
}
#endif
+#if defined(HAS_SPLITUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ SplitUVRow = SplitUVRow_RVV;
+ }
+#endif
+
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -3898,19 +5642,26 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
{
int awidth = halfwidth * 2;
// row of y and 2 rows of uv
align_buffer_64(rows, awidth * 3);
+ if (!rows)
+ return 1;
for (y = 0; y < height - 1; y += 2) {
// Split Y from UV.
@@ -3933,6 +5684,57 @@ int UYVYToNV12(const uint8_t* src_uyvy,
return 0;
}
+// width and height are the source plane size, allowing odd sizes to be
+// handled.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
+ const uint8_t* src_v, int src_stride_v,
+ uint8_t* dst_uv, int width) = HalfMergeUVRow_C;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_HALFMERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_NEON;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+ HalfMergeUVRow = HalfMergeUVRow_AVX2;
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ // Merge a row of U and V into a row of UV.
+ HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
+ }
+}
+
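+// Each output UV pair above is a rounded 2x2 box average of the
+// full-resolution U and V planes, folding 4:4:4 chroma down to NV12's
+// half resolution. A scalar sketch for even widths (the library row
+// function also handles the odd-column tail; illustrative):
+static void ExampleHalfMergeUVRow(const uint8_t* src_u, int src_stride_u,
+ const uint8_t* src_v, int src_stride_v,
+ uint8_t* dst_uv, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+ src_u[src_stride_u + 1] + 2) >> 2;
+ dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+ src_v[src_stride_v + 1] + 2) >> 2;
+ src_u += 2;
+ src_v += 2;
+ dst_uv += 2;
+ }
+}
+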
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/source/rotate.cc b/source/rotate.cc
new file mode 100644
index 00000000..3f8332c3
--- /dev/null
+++ b/source/rotate.cc
@@ -0,0 +1,1231 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+void TransposePlane(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int i = height;
+#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX)
+ void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
+ int dst_stride, int width) = TransposeWx16_C;
+#else
+ void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
+ int dst_stride, int width) = TransposeWx8_C;
+#endif
+
+#if defined(HAS_TRANSPOSEWX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeWx8 = TransposeWx8_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeWx8 = TransposeWx8_NEON;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ TransposeWx8 = TransposeWx8_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeWx8 = TransposeWx8_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx8 = TransposeWx8_Fast_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeWx16 = TransposeWx16_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx16 = TransposeWx16_MSA;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX16_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ TransposeWx16 = TransposeWx16_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx16 = TransposeWx16_LSX;
+ }
+ }
+#endif
+
+#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX)
+ // Work across the source in 16x16 tiles
+ while (i >= 16) {
+ TransposeWx16(src, src_stride, dst, dst_stride, width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst += 16; // Move over 16 columns.
+ i -= 16;
+ }
+#else
+ // Work across the source in 8x8 tiles
+ while (i >= 8) {
+ TransposeWx8(src, src_stride, dst, dst_stride, width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst += 8; // Move over 8 columns.
+ i -= 8;
+ }
+#endif
+
+ if (i > 0) {
+ TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+ }
+}
+
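+// All of the tile kernels above reduce to the same index swap: output
+// element (i, j) reads input element (j, i). A scalar sketch, equivalent
+// in spirit to the TransposeWxH_C leftover path (illustrative):
+static void ExampleTransposeWxH(const uint8_t* src, int src_stride,
+ uint8_t* dst, int dst_stride, int width,
+ int height) {
+ int i, j;
+ for (i = 0; i < width; ++i) {
+ for (j = 0; j < height; ++j) {
+ dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
+}
+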
+LIBYUV_API
+void RotatePlane90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 90 is a transpose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
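+// Worked example of the flip-then-transpose identity (and of its dual: the
+// 270 rotation below flips the destination instead). For a 2x3 plane
+// a b c
+// d e f
+// reading rows bottom to top gives (d e f / a b c); transposing that
+// yields the 3x2 clockwise rotation:
+// d a
+// e b
+// f c
+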
+LIBYUV_API
+void RotatePlane270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 270 is a transpose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API
+void RotatePlane180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Swap top and bottom row and mirror the content. Uses a temporary row.
+ align_buffer_64(row, width);
+ assert(row);
+ if (!row)
+ return;
+ const uint8_t* src_bot = src + src_stride * (height - 1);
+ uint8_t* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ int y;
+ void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MirrorRow = MirrorRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ MirrorRow = MirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ CopyRow = CopyRow_RVV;
+ }
+#endif
+
+ // Odd height will harmlessly mirror the middle row twice.
+ for (y = 0; y < half_height; ++y) {
+ CopyRow(src, row, width); // Copy top row into buffer
+ MirrorRow(src_bot, dst, width); // Mirror bottom row into top row
+ MirrorRow(row, dst_bot, width); // Mirror buffer into bottom row
+ src += src_stride;
+ dst += dst_stride;
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
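+// Worked example: rotating
+// a b c
+// d e f
+// by 180 degrees mirrors each row and swaps top with bottom:
+// f e d
+// c b a
+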
+LIBYUV_API
+void SplitTransposeUV(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int i = height;
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx16_C;
+#elif defined(HAS_TRANSPOSEUVWX16_LSX)
+ void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx16_C;
+#else
+ void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx8_C;
+#endif
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx16 = TransposeUVWx16_MSA;
+ }
+ }
+#elif defined(HAS_TRANSPOSEUVWX16_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ TransposeUVWx16 = TransposeUVWx16_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx16 = TransposeUVWx16_LSX;
+ }
+ }
+#else
+#if defined(HAS_TRANSPOSEUVWX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeUVWx8 = TransposeUVWx8_NEON;
+ }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx8 = TransposeUVWx8_SSE2;
+ }
+ }
+#endif
+#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ // Work through the source in 16-row tiles.
+ while (i >= 16) {
+ TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst_a += 16; // Move over 16 columns.
+ dst_b += 16; // Move over 16 columns.
+ i -= 16;
+ }
+#elif defined(HAS_TRANSPOSEUVWX16_LSX)
+ // Work through the source in 16-row tiles.
+ while (i >= 16) {
+ TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst_a += 16; // Move over 16 columns.
+ dst_b += 16; // Move over 16 columns.
+ i -= 16;
+ }
+#else
+ // Work through the source in 8x8 tiles.
+ while (i >= 8) {
+ TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst_a += 8; // Move over 8 columns.
+ dst_b += 8; // Move over 8 columns.
+ i -= 8;
+ }
+#endif
+
+ if (i > 0) {
+ TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width, i);
+ }
+}
+
+LIBYUV_API
+void SplitRotateUV90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+
+ SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width, height);
+}
+
+LIBYUV_API
+void SplitRotateUV270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ dst_a += dst_stride_a * (width - 1);
+ dst_b += dst_stride_b * (width - 1);
+ dst_stride_a = -dst_stride_a;
+ dst_stride_b = -dst_stride_b;
+
+ SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width, height);
+}
+
+// Rotate 180 is a horizontal and vertical flip.
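+// For interleaved UV input, MirrorSplitUVRow below both reverses a row and
+// deinterleaves it, so each source row lands mirrored in dst_a and dst_b.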
+LIBYUV_API
+void SplitRotateUV180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int i;
+ void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
+ int width) = MirrorSplitUVRow_C;
+#if defined(HAS_MIRRORSPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_NEON;
+ }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_MSA;
+ }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 32)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_LSX;
+ }
+#endif
+
+ dst_a += dst_stride_a * (height - 1);
+ dst_b += dst_stride_b * (height - 1);
+
+ for (i = 0; i < height; ++i) {
+ MirrorSplitUVRow(src, dst_a, dst_b, width);
+ src += src_stride;
+ dst_a -= dst_stride_a;
+ dst_b -= dst_stride_b;
+ }
+}
+
+// Rotate UV and split into planar U and V planes.
+// Width and height are expected to be half size for NV12.
+LIBYUV_API
+int SplitRotateUV(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src_uv || width <= 0 || height == 0 || !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+ return 0;
+ case kRotate90:
+ SplitRotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+ return 0;
+ case kRotate270:
+ SplitRotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+ return 0;
+ case kRotate180:
+ SplitRotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int RotatePlane(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src || width <= 0 || height == 0 || !dst) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+static void TransposePlane_16(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int i = height;
+ // Work across the source in 8x8 tiles.
+ while (i >= 8) {
+ TransposeWx8_16_C(src, src_stride, dst, dst_stride, width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst += 8; // Move over 8 columns.
+ i -= 8;
+ }
+
+ if (i > 0) {
+ TransposeWxH_16_C(src, src_stride, dst, dst_stride, width, i);
+ }
+}
+
+static void RotatePlane90_16(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 90 is a transpose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ TransposePlane_16(src, src_stride, dst, dst_stride, width, height);
+}
+
+static void RotatePlane270_16(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 270 is a transpose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ TransposePlane_16(src, src_stride, dst, dst_stride, width, height);
+}
+
+static void RotatePlane180_16(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ const uint16_t* src_bot = src + src_stride * (height - 1);
+ uint16_t* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ int y;
+
+ // Swap the top and bottom rows and mirror the content. Uses a temporary row.
+ align_buffer_64(row, width * 2);
+ uint16_t* row_tmp = (uint16_t*)row;
+ assert(row);
+ if (!row)
+ return;
+
+ // Odd height will harmlessly mirror the middle row twice.
+ for (y = 0; y < half_height; ++y) {
+ CopyRow_16_C(src, row_tmp, width); // Copy top row into buffer
+ MirrorRow_16_C(src_bot, dst, width); // Mirror bottom row into top row
+ MirrorRow_16_C(row_tmp, dst_bot, width); // Mirror buffer into bottom row
+ src += src_stride;
+ dst += dst_stride;
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+LIBYUV_API
+int RotatePlane_16(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src || width <= 0 || height == 0 || !dst) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane_16(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90_16(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270_16(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180_16(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int I420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+// I422 has half-width x full-height UV planes, so rotations by 90 and 270
+// require scaling to maintain 4:2:2 subsampling.
+LIBYUV_API
+int I422Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ int r;
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // Copy frame
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+ return 0;
+
+ // Note on the temporary use of the Y plane for UV.
+ // The UV planes are rotated first; their rotated data fits within the
+ // rows of the Y destination plane:
+ // Y plane is width x height
+ // Y plane rotated is height x width
+ // UV plane is (width / 2) x height
+ // UV plane rotated is height x (width / 2)
+ // UV plane rotated+scaled is (height / 2) x width.
+ // UV plane rotated is a temporary that fits within the Y plane rotated.
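+ // Worked example (illustrative): for 640x480 I422 the rotated U plane
+ // is 480 wide x 320 tall and fits in the 480x640 rotated Y plane;
+ // ScalePlane then resamples it to 240x640 to restore 4:2:2 subsampling.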
+
+ case kRotate90:
+ RotatePlane90(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_u,
+ dst_stride_u, halfheight, width, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane90(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_v,
+ dst_stride_v, halfheight, width, kFilterLinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_u,
+ dst_stride_u, halfheight, width, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane270(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_v,
+ dst_stride_v, halfheight, width, kFilterLinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ height);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int NV12ToI420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
+ !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ SplitRotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ SplitRotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ SplitRotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
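+// Copy every src_pixel_stride_uv-th byte of a row; e.g. a pixel stride of
+// 2 extracts a single channel from an interleaved UV row.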
+static void SplitPixels(const uint8_t* src_u,
+ int src_pixel_stride_uv,
+ uint8_t* dst_u,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst_u = *src_u;
+ ++dst_u;
+ src_u += src_pixel_stride_uv;
+ }
+}
+
+// Convert Android420 to I420 with Rotate
+LIBYUV_API
+int Android420ToI420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode rotation) {
+ int y;
+ const ptrdiff_t vu_off = src_v - src_u;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ RotatePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
+ rotation);
+ }
+
+ // Copy UV planes - I420
+ if (src_pixel_stride_uv == 1) {
+ RotatePlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight,
+ rotation);
+ RotatePlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight,
+ rotation);
+ return 0;
+ }
+ // Split UV planes - NV21
+ if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+ src_stride_u == src_stride_v) {
+ SplitRotateUV(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
+ halfwidth, halfheight, rotation);
+ return 0;
+ }
+ // Split UV planes - NV12
+ if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
+ SplitRotateUV(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ halfwidth, halfheight, rotation);
+ return 0;
+ }
+
+ if (rotation == 0) {
+ for (y = 0; y < halfheight; ++y) {
+ SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth);
+ SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+ }
+ // Unsupported UV pixel stride and/or rotation.
+ return -1;
+}
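+
+// Usage sketch (illustrative, not part of the public header): rotate a
+// w x h Android 420 frame with interleaved chroma (pixel stride 2) by 90
+// degrees into an h x w I420 destination:
+// Android420ToI420Rotate(src_y, src_stride_y, src_u, src_stride_u,
+// src_v, src_stride_v, 2, dst_y, h,
+// dst_u, (h + 1) / 2, dst_v, (h + 1) / 2,
+// w, h, kRotate90);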
+
+LIBYUV_API
+int I010Rotate(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v || dst_stride_y < 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return I010Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height);
+ case kRotate90:
+ RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane90_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width,
+ height);
+ RotatePlane270_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane270_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width,
+ height);
+ RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+// I210 has half-width x full-height UV planes, so rotations by 90 and 270
+// require scaling to maintain 4:2:2 subsampling.
+LIBYUV_API
+int I210Rotate(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ int r;
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // Copy frame
+ CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+ CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+ return 0;
+
+ // Note on the temporary use of the Y plane for UV.
+ // The UV planes are rotated first; their rotated data fits within the
+ // rows of the Y destination plane:
+ // Y plane is width x height
+ // Y plane rotated is height x width
+ // UV plane is (width / 2) x height
+ // UV plane rotated is height x (width / 2)
+ // UV plane rotated+scaled is (height / 2) x width.
+ // UV plane rotated is a temporary that fits within the Y plane rotated.
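+ // The staging is the same as in I422Rotate above, with 16-bit samples:
+ // e.g. for 640x480 I210 the rotated U plane (480x320) fits in the
+ // 480x640 rotated Y plane before ScalePlane_16 resamples it to 240x640.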
+
+ case kRotate90:
+ RotatePlane90_16(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_u,
+ dst_stride_u, halfheight, width, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane90_16(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_v,
+ dst_stride_v, halfheight, width, kFilterLinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270_16(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_u,
+ dst_stride_u, halfheight, width, kFilterBilinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane270_16(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth,
+ height);
+ r = ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_v,
+ dst_stride_v, halfheight, width, kFilterLinear);
+ if (r != 0) {
+ return r;
+ }
+ RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width,
+ height);
+ return 0;
+ case kRotate180:
+ RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width,
+ height);
+ RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ height);
+ RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int I410Rotate(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v || dst_stride_y < 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane90_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width,
+ height);
+ RotatePlane270_16(src_u, src_stride_u, dst_u, dst_stride_u, width,
+ height);
+ RotatePlane270_16(src_v, src_stride_v, dst_v, dst_stride_v, width,
+ height);
+ return 0;
+ case kRotate180:
+ RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width,
+ height);
+ RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, width,
+ height);
+ RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, width,
+ height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/rotate_any.cc b/source/rotate_any.cc
index b3baf084..88ca7876 100644
--- a/files/source/rotate_any.cc
+++ b/source/rotate_any.cc
@@ -35,15 +35,15 @@ TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
#endif
-#ifdef HAS_TRANSPOSEWX8_MMI
-TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7)
-#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
#endif
#ifdef HAS_TRANSPOSEWX16_MSA
TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15)
#endif
+#ifdef HAS_TRANSPOSEWX16_LSX
+TANY(TransposeWx16_Any_LSX, TransposeWx16_LSX, 15)
+#endif
#undef TANY
#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
@@ -65,12 +65,12 @@ TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
#ifdef HAS_TRANSPOSEUVWX8_SSE2
TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#endif
-#ifdef HAS_TRANSPOSEUVWX8_MMI
-TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7)
-#endif
#ifdef HAS_TRANSPOSEUVWX16_MSA
TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
#endif
+#ifdef HAS_TRANSPOSEUVWX16_LSX
+TUVANY(TransposeUVWx16_Any_LSX, TransposeUVWx16_LSX, 7)
+#endif
#undef TUVANY
#ifdef __cplusplus
diff --git a/files/source/rotate_argb.cc b/source/rotate_argb.cc
index a93fd55f..d55fac4f 100644
--- a/files/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -8,11 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "libyuv/rotate.h"
+#include "libyuv/rotate_argb.h"
#include "libyuv/convert.h"
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
#include "libyuv/row.h"
#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */
@@ -21,17 +22,21 @@ namespace libyuv {
extern "C" {
#endif
-static void ARGBTranspose(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBTranspose(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int i;
int src_pixel_step = src_stride_argb >> 2;
void (*ScaleARGBRowDownEven)(
const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
+ // Check stride is a multiple of 4.
+ if (src_stride_argb & 3) {
+ return -1;
+ }
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
@@ -56,60 +61,65 @@ static void ARGBTranspose(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_LSX;
if (IS_ALIGNED(height, 4)) { // Width of dest.
- ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI;
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_LSX;
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_RVV;
+ }
+#endif
for (i = 0; i < width; ++i) { // column of source to row of dest.
ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
dst_argb += dst_stride_argb;
src_argb += 4;
}
+ return 0;
}
-void ARGBRotate90(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate90(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src_argb += src_stride_argb * (height - 1);
src_stride_argb = -src_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate270(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate270(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst_argb += dst_stride_argb * (width - 1);
dst_stride_argb = -dst_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate180(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate180(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Swap the first and last rows and mirror the content. Uses a temporary row.
- align_buffer_64(row, width * 4);
const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1);
int half_height = (height + 1) >> 1;
@@ -118,10 +128,13 @@ void ARGBRotate180(const uint8_t* src_argb,
ARGBMirrorRow_C;
void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
CopyRow_C;
+ align_buffer_64(row, width * 4);
+ if (!row)
+ return 1;
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -150,11 +163,19 @@ void ARGBRotate180(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBMIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBMirrorRow = ARGBMirrorRow_MMI;
+#if defined(HAS_ARGBMIRRORROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_LASX;
}
}
#endif
@@ -178,6 +199,11 @@ void ARGBRotate180(const uint8_t* src_argb,
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
+#if defined(HAS_COPYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ CopyRow = CopyRow_RVV;
+ }
+#endif
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
@@ -190,6 +216,7 @@ void ARGBRotate180(const uint8_t* src_argb,
dst_bot -= dst_stride_argb;
}
free_aligned_buffer_64(row);
+ return 0;
}
LIBYUV_API
@@ -217,17 +244,14 @@ int ARGBRotate(const uint8_t* src_argb,
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
- ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate270:
- ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate180:
- ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
default:
break;
}
diff --git a/source/rotate_common.cc b/source/rotate_common.cc
new file mode 100644
index 00000000..e72608e9
--- /dev/null
+++ b/source/rotate_common.cc
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void TransposeWx8_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst[0] = src[0 * src_stride];
+ dst[1] = src[1 * src_stride];
+ dst[2] = src[2 * src_stride];
+ dst[3] = src[3 * src_stride];
+ dst[4] = src[4 * src_stride];
+ dst[5] = src[5 * src_stride];
+ dst[6] = src[6 * src_stride];
+ dst[7] = src[7 * src_stride];
+ ++src;
+ dst += dst_stride;
+ }
+}
+
+void TransposeUVWx8_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst_a[0] = src[0 * src_stride + 0];
+ dst_b[0] = src[0 * src_stride + 1];
+ dst_a[1] = src[1 * src_stride + 0];
+ dst_b[1] = src[1 * src_stride + 1];
+ dst_a[2] = src[2 * src_stride + 0];
+ dst_b[2] = src[2 * src_stride + 1];
+ dst_a[3] = src[3 * src_stride + 0];
+ dst_b[3] = src[3 * src_stride + 1];
+ dst_a[4] = src[4 * src_stride + 0];
+ dst_b[4] = src[4 * src_stride + 1];
+ dst_a[5] = src[5 * src_stride + 0];
+ dst_b[5] = src[5 * src_stride + 1];
+ dst_a[6] = src[6 * src_stride + 0];
+ dst_b[6] = src[6 * src_stride + 1];
+ dst_a[7] = src[7 * src_stride + 0];
+ dst_b[7] = src[7 * src_stride + 1];
+ src += 2;
+ dst_a += dst_stride_a;
+ dst_b += dst_stride_b;
+ }
+}
+
+void TransposeWxH_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
+}
+
+void TransposeUVWxH_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int i;
+ for (i = 0; i < width * 2; i += 2) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst_a[((i >> 1) * dst_stride_a) + j] = src[i + (j * src_stride)];
+ dst_b[((i >> 1) * dst_stride_b) + j] = src[i + (j * src_stride) + 1];
+ }
+ }
+}
+
+void TransposeWx8_16_C(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst[0] = src[0 * src_stride];
+ dst[1] = src[1 * src_stride];
+ dst[2] = src[2 * src_stride];
+ dst[3] = src[3 * src_stride];
+ dst[4] = src[4 * src_stride];
+ dst[5] = src[5 * src_stride];
+ dst[6] = src[6 * src_stride];
+ dst[7] = src[7 * src_stride];
+ ++src;
+ dst += dst_stride;
+ }
+}
+
+void TransposeWxH_16_C(const uint16_t* src,
+ int src_stride,
+ uint16_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
+}
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ const uint8_t* src1 = src + src_stride;
+ const uint8_t* src2 = src1 + src_stride;
+ const uint8_t* src3 = src2 + src_stride;
+ uint8_t* dst1 = dst + dst_stride;
+ uint8_t* dst2 = dst1 + dst_stride;
+ uint8_t* dst3 = dst2 + dst_stride;
+ int i;
+ for (i = 0; i < width; i += 4) {
+ uint32_t p00 = ((uint32_t*)(src))[0];
+ uint32_t p10 = ((uint32_t*)(src))[1];
+ uint32_t p20 = ((uint32_t*)(src))[2];
+ uint32_t p30 = ((uint32_t*)(src))[3];
+ uint32_t p01 = ((uint32_t*)(src1))[0];
+ uint32_t p11 = ((uint32_t*)(src1))[1];
+ uint32_t p21 = ((uint32_t*)(src1))[2];
+ uint32_t p31 = ((uint32_t*)(src1))[3];
+ uint32_t p02 = ((uint32_t*)(src2))[0];
+ uint32_t p12 = ((uint32_t*)(src2))[1];
+ uint32_t p22 = ((uint32_t*)(src2))[2];
+ uint32_t p32 = ((uint32_t*)(src2))[3];
+ uint32_t p03 = ((uint32_t*)(src3))[0];
+ uint32_t p13 = ((uint32_t*)(src3))[1];
+ uint32_t p23 = ((uint32_t*)(src3))[2];
+ uint32_t p33 = ((uint32_t*)(src3))[3];
+ ((uint32_t*)(dst))[0] = p00;
+ ((uint32_t*)(dst))[1] = p01;
+ ((uint32_t*)(dst))[2] = p02;
+ ((uint32_t*)(dst))[3] = p03;
+ ((uint32_t*)(dst1))[0] = p10;
+ ((uint32_t*)(dst1))[1] = p11;
+ ((uint32_t*)(dst1))[2] = p12;
+ ((uint32_t*)(dst1))[3] = p13;
+ ((uint32_t*)(dst2))[0] = p20;
+ ((uint32_t*)(dst2))[1] = p21;
+ ((uint32_t*)(dst2))[2] = p22;
+ ((uint32_t*)(dst2))[3] = p23;
+ ((uint32_t*)(dst3))[0] = p30;
+ ((uint32_t*)(dst3))[1] = p31;
+ ((uint32_t*)(dst3))[2] = p32;
+ ((uint32_t*)(dst3))[3] = p33;
+ src += src_stride * 4; // advance 4 rows
+ src1 += src_stride * 4;
+ src2 += src_stride * 4;
+ src3 += src_stride * 4;
+ dst += 4 * 4; // advance 4 columns
+ dst1 += 4 * 4;
+ dst2 += 4 * 4;
+ dst3 += 4 * 4;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc
new file mode 100644
index 00000000..fd5eee05
--- /dev/null
+++ b/source/rotate_gcc.cc
@@ -0,0 +1,503 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+// Transpose 8x8. 32 or 64 bit, but not NaCl for 64 bit.
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+void TransposeWx8_SSSE3(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
+
+// Transpose 16x8. 64 bit.
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+ "xmm15");
+}
+#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+
+// Transpose UV 8x8. 64 bit.
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+void TransposeUVWx8_SSE2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
+ // Second round of bit swap.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_a), // %1
+ "+r"(dst_b), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride)), // %4
+ "r"((intptr_t)(dst_stride_a)), // %5
+ "r"((intptr_t)(dst_stride_b)) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7", "xmm8", "xmm9");
+}
+#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
+
+#if defined(HAS_TRANSPOSE4X4_32_SSE2)
+// 4 values, little-endian view
+// a b c d
+// e f g h
+// i j k l
+// m n o p
+
+// transpose 2x2
+// a e b f from row 0, 1
+// i m j n from row 2, 3
+// c g d h from row 0, 1
+// k o l p from row 2, 3
+
+// transpose 4x4
+// a e i m from row 0, 1
+// b f j n from row 0, 1
+// c g k o from row 2, 3
+// d h l p from row 2, 3
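+
+// i.e. two rounds of interleaves: punpckldq/punpckhdq build the 2x2 blocks,
+// then punpcklqdq/punpckhqdq assemble the full transposed rows.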
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_SSE2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Main loop transpose 4x4. Read a column, write a row.
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // a b c d
+ "movdqu (%0,%3),%%xmm1 \n" // e f g h
+ "lea (%0,%3,2),%0 \n" // src += stride * 2
+ "movdqu (%0),%%xmm2 \n" // i j k l
+ "movdqu (%0,%3),%%xmm3 \n" // m n o p
+ "lea (%0,%3,2),%0 \n" // src += stride * 2
+
+ // Transpose 2x2
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "punpckldq %%xmm1,%%xmm4 \n" // a e b f from row 0, 1
+ "punpckldq %%xmm3,%%xmm5 \n" // i m j n from row 2, 3
+ "punpckhdq %%xmm1,%%xmm6 \n" // c g d h from row 0, 1
+ "punpckhdq %%xmm3,%%xmm7 \n" // k o l p from row 2, 3
+
+ // Transpose 4x4
+ "movdqa %%xmm4,%%xmm0 \n"
+ "movdqa %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklqdq %%xmm5,%%xmm0 \n" // a e i m from row 0, 1
+ "punpckhqdq %%xmm5,%%xmm1 \n" // b f j n from row 0, 1
+ "punpcklqdq %%xmm7,%%xmm2 \n" // c g k o from row 2, 3
+ "punpckhqdq %%xmm7,%%xmm3 \n" // d h l p from row 2, 3
+
+ "movdqu %%xmm0,(%1) \n"
+ "lea 16(%1,%4),%1 \n" // dst += stride + 16
+ "movdqu %%xmm1,-16(%1) \n"
+ "movdqu %%xmm2,-16(%1,%4) \n"
+ "movdqu %%xmm3,-16(%1,%4,2) \n"
+ "sub %4,%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+rm"(width) // %2
+ : "r"((ptrdiff_t)(src_stride)), // %3
+ "r"((ptrdiff_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // defined(HAS_TRANSPOSE4X4_32_SSE2)
+
+#if defined(HAS_TRANSPOSE4X4_32_AVX2)
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_AVX2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Main loop transpose 2 blocks of 4x4. Read a column, write a row.
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // a b c d
+ "vmovdqu (%0,%3),%%xmm1 \n" // e f g h
+ "lea (%0,%3,2),%0 \n" // src += stride * 2
+ "vmovdqu (%0),%%xmm2 \n" // i j k l
+ "vmovdqu (%0,%3),%%xmm3 \n" // m n o p
+ "lea (%0,%3,2),%0 \n" // src += stride * 2
+
+ "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // a b c d
+ "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // e f g h
+ "lea (%0,%3,2),%0 \n" // src += stride * 2
+ "vinserti128 $1,(%0),%%ymm2,%%ymm2 \n" // i j k l
+ "vinserti128 $1,(%0,%3),%%ymm3,%%ymm3 \n" // m n o p
+ "lea (%0,%3,2),%0 \n" // src += stride * 2
+
+ // Transpose 2x2
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm4 \n" // a e b f from row 0, 1
+ "vpunpckldq %%ymm3,%%ymm2,%%ymm5 \n" // i m j n from row 2, 3
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm6 \n" // c g d h from row 0, 1
+ "vpunpckhdq %%ymm3,%%ymm2,%%ymm7 \n" // k o l p from row 2, 3
+
+ // Transpose 4x4
+ "vpunpcklqdq %%ymm5,%%ymm4,%%ymm0 \n" // a e i m from row 0, 1
+ "vpunpckhqdq %%ymm5,%%ymm4,%%ymm1 \n" // b f j n from row 0, 1
+ "vpunpcklqdq %%ymm7,%%ymm6,%%ymm2 \n" // c g k o from row 2, 3
+ "vpunpckhqdq %%ymm7,%%ymm6,%%ymm3 \n" // d h l p from row 2, 3
+
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 32(%1,%4),%1 \n" // dst += stride + 32
+ "vmovdqu %%ymm1,-32(%1) \n"
+ "vmovdqu %%ymm2,-32(%1,%4) \n"
+ "vmovdqu %%ymm3,-32(%1,%4,2) \n"
+ "sub %4,%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+rm"(width) // %2
+ : "r"((ptrdiff_t)(src_stride)), // %3
+ "r"((ptrdiff_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // defined(HAS_TRANSPOSE4X4_32_AVX2)
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/rotate_lsx.cc b/source/rotate_lsx.cc
new file mode 100644
index 00000000..94a2b91c
--- /dev/null
+++ b/source/rotate_lsx.cc
@@ -0,0 +1,243 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
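+// Note (illustrative): each ILVLH_* macro pairs __lsx_vilvl_* (interleave
+// the low halves of two vectors) with __lsx_vilvh_* (interleave the high
+// halves) at byte, halfword, word or doubleword granularity.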
+#define ILVLH_B(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ DUP2_ARG2(__lsx_vilvl_b, in1, in0, in3, in2, out0, out2); \
+ DUP2_ARG2(__lsx_vilvh_b, in1, in0, in3, in2, out1, out3); \
+ }
+
+#define ILVLH_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, out0, out2); \
+ DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, out1, out3); \
+ }
+
+#define ILVLH_W(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ DUP2_ARG2(__lsx_vilvl_w, in1, in0, in3, in2, out0, out2); \
+ DUP2_ARG2(__lsx_vilvh_w, in1, in0, in3, in2, out1, out3); \
+ }
+
+#define ILVLH_D(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ DUP2_ARG2(__lsx_vilvl_d, in1, in0, in3, in2, out0, out2); \
+ DUP2_ARG2(__lsx_vilvh_d, in1, in0, in3, in2, out1, out3); \
+ }
+
+#define LSX_ST_4(_dst0, _dst1, _dst2, _dst3, _dst, _stride, _stride2, \
+ _stride3, _stride4) \
+ { \
+ __lsx_vst(_dst0, _dst, 0); \
+ __lsx_vstx(_dst1, _dst, _stride); \
+ __lsx_vstx(_dst2, _dst, _stride2); \
+ __lsx_vstx(_dst3, _dst, _stride3); \
+ _dst += _stride4; \
+ }
+
+#define LSX_ST_2(_dst0, _dst1, _dst, _stride, _stride2) \
+ { \
+ __lsx_vst(_dst0, _dst, 0); \
+ __lsx_vstx(_dst1, _dst, _stride); \
+ _dst += _stride2; \
+ }
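+
+// The ILVLH_* macros build the transpose out of paired low/high interleaves.
+// An illustrative scalar model of the byte variant (assumption: the
+// vilvl/vilvh pair interleaves the low then the high 8 bytes of two rows;
+// not part of the library):
+static inline void IlvlhBSketch(const uint8_t r0[16],
+                                const uint8_t r1[16],
+                                uint8_t lo[16],
+                                uint8_t hi[16]) {
+  int i;
+  for (i = 0; i < 8; ++i) {
+    lo[2 * i] = r0[i];      // low halves, bytes alternating r0/r1
+    lo[2 * i + 1] = r1[i];
+    hi[2 * i] = r0[8 + i];  // high halves, bytes alternating r0/r1
+    hi[2 * i + 1] = r1[8 + i];
+  }
+}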
+
+void TransposeWx16_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ TransposeWx8_C(src, src_stride, dst, dst_stride, width);
+ TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
+ width);
+}
+
+void TransposeUVWx16_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
+ dst_stride_a, (dst_b + 8), dst_stride_b, width);
+}
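+
+// Layout check for the Wx16 wrappers above: transposing a W x 16 tile yields
+// a 16 x W tile, so source rows 8..15 become destination columns 8..15. That
+// is why the second Wx8 call reads from src + 8 * src_stride and writes to
+// dst + 8 (dst_a + 8 / dst_b + 8 in the UV variant).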
+
+void TransposeWx16_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ int x;
+ int len = width / 16;
+ uint8_t* s;
+ int src_stride2 = src_stride << 1;
+ int src_stride3 = src_stride + src_stride2;
+ int src_stride4 = src_stride2 << 1;
+ int dst_stride2 = dst_stride << 1;
+ int dst_stride3 = dst_stride + dst_stride2;
+ int dst_stride4 = dst_stride2 << 1;
+ __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+ for (x = 0; x < len; x++) {
+ s = (uint8_t*)src;
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
+ ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+ ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
+ res8 = __lsx_vilvl_w(reg4, reg0);
+ res9 = __lsx_vilvh_w(reg4, reg0);
+ ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
+ dst_stride4);
+ res8 = __lsx_vilvl_w(reg5, reg1);
+ res9 = __lsx_vilvh_w(reg5, reg1);
+ ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
+ dst_stride4);
+ res8 = __lsx_vilvl_w(reg6, reg2);
+ res9 = __lsx_vilvh_w(reg6, reg2);
+ ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
+ dst_stride4);
+ res8 = __lsx_vilvl_w(reg7, reg3);
+ res9 = __lsx_vilvh_w(reg7, reg3);
+ ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
+ dst_stride4);
+ src += 16;
+ }
+}
+
+void TransposeUVWx16_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ int x;
+ int len = width / 8;
+ uint8_t* s;
+ int src_stride2 = src_stride << 1;
+ int src_stride3 = src_stride + src_stride2;
+ int src_stride4 = src_stride2 << 1;
+ int dst_stride_a2 = dst_stride_a << 1;
+ int dst_stride_b2 = dst_stride_b << 1;
+ __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+ for (x = 0; x < len; x++) {
+ s = (uint8_t*)src;
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
+ ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+ ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
+ src0 = __lsx_vld(s, 0);
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
+ res8 = __lsx_vilvl_w(reg4, reg0);
+ res9 = __lsx_vilvh_w(reg4, reg0);
+ ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
+ LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
+ res8 = __lsx_vilvl_w(reg5, reg1);
+ res9 = __lsx_vilvh_w(reg5, reg1);
+ ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
+ LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
+ res8 = __lsx_vilvl_w(reg6, reg2);
+ res9 = __lsx_vilvh_w(reg6, reg2);
+ ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
+ LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
+ res8 = __lsx_vilvl_w(reg7, reg3);
+ res9 = __lsx_vilvh_w(reg7, reg3);
+ ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
+ LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
+ src += 16;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
diff --git a/files/source/rotate_msa.cc b/source/rotate_msa.cc
index 99bdca65..99bdca65 100644
--- a/files/source/rotate_msa.cc
+++ b/source/rotate_msa.cc
diff --git a/files/source/rotate_neon.cc b/source/rotate_neon.cc
index fdc0dd47..569a7318 100644
--- a/files/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -38,52 +38,52 @@ void TransposeWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "vld1.8 {d0}, [%0], %2 \n"
- "vld1.8 {d1}, [%0], %2 \n"
- "vld1.8 {d2}, [%0], %2 \n"
- "vld1.8 {d3}, [%0], %2 \n"
- "vld1.8 {d4}, [%0], %2 \n"
- "vld1.8 {d5}, [%0], %2 \n"
- "vld1.8 {d6}, [%0], %2 \n"
- "vld1.8 {d7}, [%0] \n"
-
- "vtrn.8 d1, d0 \n"
- "vtrn.8 d3, d2 \n"
- "vtrn.8 d5, d4 \n"
- "vtrn.8 d7, d6 \n"
-
- "vtrn.16 d1, d3 \n"
- "vtrn.16 d0, d2 \n"
- "vtrn.16 d5, d7 \n"
- "vtrn.16 d4, d6 \n"
-
- "vtrn.32 d1, d5 \n"
- "vtrn.32 d0, d4 \n"
- "vtrn.32 d3, d7 \n"
- "vtrn.32 d2, d6 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
-
- "mov %0, %3 \n"
-
- "vst1.8 {d1}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d3}, [%0], %4 \n"
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d5}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d7}, [%0], %4 \n"
- "vst1.8 {d6}, [%0] \n"
-
- "add %1, #8 \n" // src += 8
- "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
- "subs %5, #8 \n" // w -= 8
- "bge 1b \n"
+ "mov %0, %1 \n"
+
+ "vld1.8 {d0}, [%0], %2 \n"
+ "vld1.8 {d1}, [%0], %2 \n"
+ "vld1.8 {d2}, [%0], %2 \n"
+ "vld1.8 {d3}, [%0], %2 \n"
+ "vld1.8 {d4}, [%0], %2 \n"
+ "vld1.8 {d5}, [%0], %2 \n"
+ "vld1.8 {d6}, [%0], %2 \n"
+ "vld1.8 {d7}, [%0] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d1}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d3}, [%0], %4 \n"
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d5}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d7}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0] \n"
+
+ "add %1, #8 \n" // src += 8
+ "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %5, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
@@ -208,68 +208,70 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "vld2.8 {d0, d1}, [%0], %2 \n"
- "vld2.8 {d2, d3}, [%0], %2 \n"
- "vld2.8 {d4, d5}, [%0], %2 \n"
- "vld2.8 {d6, d7}, [%0], %2 \n"
- "vld2.8 {d16, d17}, [%0], %2 \n"
- "vld2.8 {d18, d19}, [%0], %2 \n"
- "vld2.8 {d20, d21}, [%0], %2 \n"
- "vld2.8 {d22, d23}, [%0] \n"
-
- "vtrn.8 q1, q0 \n"
- "vtrn.8 q3, q2 \n"
- "vtrn.8 q9, q8 \n"
- "vtrn.8 q11, q10 \n"
-
- "vtrn.16 q1, q3 \n"
- "vtrn.16 q0, q2 \n"
- "vtrn.16 q9, q11 \n"
- "vtrn.16 q8, q10 \n"
-
- "vtrn.32 q1, q9 \n"
- "vtrn.32 q0, q8 \n"
- "vtrn.32 q3, q11 \n"
- "vtrn.32 q2, q10 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
- "vrev16.8 q8, q8 \n"
- "vrev16.8 q9, q9 \n"
- "vrev16.8 q10, q10 \n"
- "vrev16.8 q11, q11 \n"
-
- "mov %0, %3 \n"
-
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d6}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d18}, [%0], %4 \n"
- "vst1.8 {d16}, [%0], %4 \n"
- "vst1.8 {d22}, [%0], %4 \n"
- "vst1.8 {d20}, [%0] \n"
-
- "mov %0, %5 \n"
-
- "vst1.8 {d3}, [%0], %6 \n"
- "vst1.8 {d1}, [%0], %6 \n"
- "vst1.8 {d7}, [%0], %6 \n"
- "vst1.8 {d5}, [%0], %6 \n"
- "vst1.8 {d19}, [%0], %6 \n"
- "vst1.8 {d17}, [%0], %6 \n"
- "vst1.8 {d23}, [%0], %6 \n"
- "vst1.8 {d21}, [%0] \n"
-
- "add %1, #8*2 \n" // src += 8*2
- "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
- "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
- "subs %7, #8 \n" // w -= 8
- "bge 1b \n"
+ "mov %0, %1 \n"
+
+ "vld2.8 {d0, d1}, [%0], %2 \n"
+ "vld2.8 {d2, d3}, [%0], %2 \n"
+ "vld2.8 {d4, d5}, [%0], %2 \n"
+ "vld2.8 {d6, d7}, [%0], %2 \n"
+ "vld2.8 {d16, d17}, [%0], %2 \n"
+ "vld2.8 {d18, d19}, [%0], %2 \n"
+ "vld2.8 {d20, d21}, [%0], %2 \n"
+ "vld2.8 {d22, d23}, [%0] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d18}, [%0], %4 \n"
+ "vst1.8 {d16}, [%0], %4 \n"
+ "vst1.8 {d22}, [%0], %4 \n"
+ "vst1.8 {d20}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ "vst1.8 {d3}, [%0], %6 \n"
+ "vst1.8 {d1}, [%0], %6 \n"
+ "vst1.8 {d7}, [%0], %6 \n"
+ "vst1.8 {d5}, [%0], %6 \n"
+ "vst1.8 {d19}, [%0], %6 \n"
+ "vst1.8 {d17}, [%0], %6 \n"
+ "vst1.8 {d23}, [%0], %6 \n"
+ "vst1.8 {d21}, [%0] \n"
+
+ "add %1, #8*2 \n" // src += 8*2
+ "add %3, %3, %4, lsl #3 \n" // dst_a += 8 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #3 \n" // dst_b += 8 *
+ // dst_stride_b
+ "subs %7, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
@@ -408,6 +410,46 @@ void TransposeUVWx8_NEON(const uint8_t* src,
: "r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ const uint8_t* src1 = src + src_stride;
+ const uint8_t* src2 = src1 + src_stride;
+ const uint8_t* src3 = src2 + src_stride;
+ uint8_t* dst1 = dst + dst_stride;
+ uint8_t* dst2 = dst1 + dst_stride;
+ uint8_t* dst3 = dst2 + dst_stride;
+ asm volatile(
+ // Main loop transpose 4x4. Read a column, write a row.
+ "1: \n"
+ "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"
+ "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n"
+ "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n"
+ "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%3], %9 \n"
+ "subs %8, %8, #4 \n" // w -= 4
+ "vst1.8 {q0}, [%4]! \n"
+ "vst1.8 {q1}, [%5]! \n"
+ "vst1.8 {q2}, [%6]! \n"
+ "vst1.8 {q3}, [%7]! \n"
+ "bgt 1b \n"
+
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(dst1), // %5
+ "+r"(dst2), // %6
+ "+r"(dst3), // %7
+ "+r"(width) // %8
+ : "r"((ptrdiff_t)(src_stride * 4)) // %9
+ : "memory", "cc", "q0", "q1", "q2", "q3");
+}
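+
+// The vld4.32 lane loads perform the transpose during the load: after four
+// loads, q0 holds column 0 of all four rows, q1 column 1, and so on, so the
+// st1 stores write whole rows. An illustrative intrinsics sketch of one 4x4
+// block (assumes <arm_neon.h>; not part of the library):
+//   uint32x4x4_t v = {};
+//   v = vld4q_lane_u32((const uint32_t*)(src + 0 * src_stride), v, 0);
+//   v = vld4q_lane_u32((const uint32_t*)(src + 1 * src_stride), v, 1);
+//   v = vld4q_lane_u32((const uint32_t*)(src + 2 * src_stride), v, 2);
+//   v = vld4q_lane_u32((const uint32_t*)(src + 3 * src_stride), v, 3);
+//   vst1q_u32((uint32_t*)(dst + 0 * dst_stride), v.val[0]);
+//   vst1q_u32((uint32_t*)(dst + 1 * dst_stride), v.val[1]);
+//   vst1q_u32((uint32_t*)(dst + 2 * dst_stride), v.val[2]);
+//   vst1q_u32((uint32_t*)(dst + 3 * dst_stride), v.val[3]);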
+
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
diff --git a/files/source/rotate_neon64.cc b/source/rotate_neon64.cc
index f469baac..95047fa7 100644
--- a/files/source/rotate_neon64.cc
+++ b/source/rotate_neon64.cc
@@ -34,58 +34,74 @@ void TransposeWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
- "sub %w3, %w3, #8 \n"
+ "sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
- "1: \n"
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "ld1 {v0.8b}, [%0], %5 \n"
+ "ld1 {v1.8b}, [%0], %5 \n"
+ "ld1 {v2.8b}, [%0], %5 \n"
+ "ld1 {v3.8b}, [%0], %5 \n"
+ "ld1 {v4.8b}, [%0], %5 \n"
+ "ld1 {v5.8b}, [%0], %5 \n"
+ "ld1 {v6.8b}, [%0], %5 \n"
+ "ld1 {v7.8b}, [%0] \n"
"mov %0, %1 \n"
- "ld1 {v0.8b}, [%0], %5 \n"
- "ld1 {v1.8b}, [%0], %5 \n"
- "ld1 {v2.8b}, [%0], %5 \n"
- "ld1 {v3.8b}, [%0], %5 \n"
- "ld1 {v4.8b}, [%0], %5 \n"
- "ld1 {v5.8b}, [%0], %5 \n"
- "ld1 {v6.8b}, [%0], %5 \n"
- "ld1 {v7.8b}, [%0] \n"
-
- "trn2 v16.8b, v0.8b, v1.8b \n"
- "trn1 v17.8b, v0.8b, v1.8b \n"
- "trn2 v18.8b, v2.8b, v3.8b \n"
- "trn1 v19.8b, v2.8b, v3.8b \n"
- "trn2 v20.8b, v4.8b, v5.8b \n"
- "trn1 v21.8b, v4.8b, v5.8b \n"
- "trn2 v22.8b, v6.8b, v7.8b \n"
- "trn1 v23.8b, v6.8b, v7.8b \n"
-
- "trn2 v3.4h, v17.4h, v19.4h \n"
- "trn1 v1.4h, v17.4h, v19.4h \n"
- "trn2 v2.4h, v16.4h, v18.4h \n"
- "trn1 v0.4h, v16.4h, v18.4h \n"
- "trn2 v7.4h, v21.4h, v23.4h \n"
- "trn1 v5.4h, v21.4h, v23.4h \n"
- "trn2 v6.4h, v20.4h, v22.4h \n"
- "trn1 v4.4h, v20.4h, v22.4h \n"
-
- "trn2 v21.2s, v1.2s, v5.2s \n"
- "trn1 v17.2s, v1.2s, v5.2s \n"
- "trn2 v20.2s, v0.2s, v4.2s \n"
- "trn1 v16.2s, v0.2s, v4.2s \n"
- "trn2 v23.2s, v3.2s, v7.2s \n"
- "trn1 v19.2s, v3.2s, v7.2s \n"
- "trn2 v22.2s, v2.2s, v6.2s \n"
- "trn1 v18.2s, v2.2s, v6.2s \n"
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 1
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 2
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 3
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+ "add %0, %0, %5 \n"
+
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 4
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 5
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 6
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 7
+ "trn1 v4.4h, v20.4h, v22.4h \n"
+
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
"mov %0, %2 \n"
- "st1 {v17.8b}, [%0], %6 \n"
- "st1 {v16.8b}, [%0], %6 \n"
- "st1 {v19.8b}, [%0], %6 \n"
- "st1 {v18.8b}, [%0], %6 \n"
- "st1 {v21.8b}, [%0], %6 \n"
- "st1 {v20.8b}, [%0], %6 \n"
- "st1 {v23.8b}, [%0], %6 \n"
- "st1 {v22.8b}, [%0] \n"
+ "st1 {v17.8b}, [%0], %6 \n"
+ "st1 {v16.8b}, [%0], %6 \n"
+ "st1 {v19.8b}, [%0], %6 \n"
+ "st1 {v18.8b}, [%0], %6 \n"
+ "st1 {v21.8b}, [%0], %6 \n"
+ "st1 {v20.8b}, [%0], %6 \n"
+ "st1 {v23.8b}, [%0], %6 \n"
+ "st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
@@ -94,33 +110,33 @@ void TransposeWx8_NEON(const uint8_t* src,
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w3, %w3, #8 \n"
- "b.eq 4f \n"
+ "adds %w3, %w3, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w3, #2 \n"
- "b.lt 3f \n"
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
- "cmp %w3, #4 \n"
- "b.lt 2f \n"
+ "cmp %w3, #4 \n"
+ "b.lt 2f \n"
// 4x8 block
- "mov %0, %1 \n"
- "ld1 {v0.s}[0], [%0], %5 \n"
- "ld1 {v0.s}[1], [%0], %5 \n"
- "ld1 {v0.s}[2], [%0], %5 \n"
- "ld1 {v0.s}[3], [%0], %5 \n"
- "ld1 {v1.s}[0], [%0], %5 \n"
- "ld1 {v1.s}[1], [%0], %5 \n"
- "ld1 {v1.s}[2], [%0], %5 \n"
- "ld1 {v1.s}[3], [%0] \n"
+ "mov %0, %1 \n"
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ "ld1 {v1.s}[3], [%0] \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- "ld1 {v2.16b}, [%4] \n"
+ "ld1 {v2.16b}, [%4] \n"
- "tbl v3.16b, {v0.16b}, v2.16b \n"
- "tbl v0.16b, {v1.16b}, v2.16b \n"
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
@@ -185,13 +201,13 @@ void TransposeWx8_NEON(const uint8_t* src,
"4: \n"
- : "=&r"(src_temp), // %0
- "+r"(src), // %1
- "+r"(dst), // %2
- "+r"(width) // %3
- : "r"(&kVTbl4x4Transpose), // %4
- "r"(static_cast<ptrdiff_t>(src_stride)), // %5
- "r"(static_cast<ptrdiff_t>(dst_stride)) // %6
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst), // %2
+ "+r"(width) // %3
+ : "r"(&kVTbl4x4Transpose), // %4
+ "r"((ptrdiff_t)src_stride), // %5
+ "r"((ptrdiff_t)dst_stride) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23");
}
@@ -212,89 +228,90 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
- "sub %w4, %w4, #8 \n"
+ "sub %w4, %w4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "ld1 {v0.16b}, [%0], %5 \n"
- "ld1 {v1.16b}, [%0], %5 \n"
- "ld1 {v2.16b}, [%0], %5 \n"
- "ld1 {v3.16b}, [%0], %5 \n"
- "ld1 {v4.16b}, [%0], %5 \n"
- "ld1 {v5.16b}, [%0], %5 \n"
- "ld1 {v6.16b}, [%0], %5 \n"
- "ld1 {v7.16b}, [%0] \n"
-
- "trn1 v16.16b, v0.16b, v1.16b \n"
- "trn2 v17.16b, v0.16b, v1.16b \n"
- "trn1 v18.16b, v2.16b, v3.16b \n"
- "trn2 v19.16b, v2.16b, v3.16b \n"
- "trn1 v20.16b, v4.16b, v5.16b \n"
- "trn2 v21.16b, v4.16b, v5.16b \n"
- "trn1 v22.16b, v6.16b, v7.16b \n"
- "trn2 v23.16b, v6.16b, v7.16b \n"
-
- "trn1 v0.8h, v16.8h, v18.8h \n"
- "trn2 v1.8h, v16.8h, v18.8h \n"
- "trn1 v2.8h, v20.8h, v22.8h \n"
- "trn2 v3.8h, v20.8h, v22.8h \n"
- "trn1 v4.8h, v17.8h, v19.8h \n"
- "trn2 v5.8h, v17.8h, v19.8h \n"
- "trn1 v6.8h, v21.8h, v23.8h \n"
- "trn2 v7.8h, v21.8h, v23.8h \n"
-
- "trn1 v16.4s, v0.4s, v2.4s \n"
- "trn2 v17.4s, v0.4s, v2.4s \n"
- "trn1 v18.4s, v1.4s, v3.4s \n"
- "trn2 v19.4s, v1.4s, v3.4s \n"
- "trn1 v20.4s, v4.4s, v6.4s \n"
- "trn2 v21.4s, v4.4s, v6.4s \n"
- "trn1 v22.4s, v5.4s, v7.4s \n"
- "trn2 v23.4s, v5.4s, v7.4s \n"
+ "mov %0, %1 \n"
- "mov %0, %2 \n"
+ "ld1 {v0.16b}, [%0], %5 \n"
+ "ld1 {v1.16b}, [%0], %5 \n"
+ "ld1 {v2.16b}, [%0], %5 \n"
+ "ld1 {v3.16b}, [%0], %5 \n"
+ "ld1 {v4.16b}, [%0], %5 \n"
+ "ld1 {v5.16b}, [%0], %5 \n"
+ "ld1 {v6.16b}, [%0], %5 \n"
+ "ld1 {v7.16b}, [%0] \n"
+ "mov %0, %1 \n"
- "st1 {v16.d}[0], [%0], %6 \n"
- "st1 {v18.d}[0], [%0], %6 \n"
- "st1 {v17.d}[0], [%0], %6 \n"
- "st1 {v19.d}[0], [%0], %6 \n"
- "st1 {v16.d}[1], [%0], %6 \n"
- "st1 {v18.d}[1], [%0], %6 \n"
- "st1 {v17.d}[1], [%0], %6 \n"
- "st1 {v19.d}[1], [%0] \n"
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
+
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
+
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
- "mov %0, %3 \n"
+ "mov %0, %2 \n"
- "st1 {v20.d}[0], [%0], %7 \n"
- "st1 {v22.d}[0], [%0], %7 \n"
- "st1 {v21.d}[0], [%0], %7 \n"
- "st1 {v23.d}[0], [%0], %7 \n"
- "st1 {v20.d}[1], [%0], %7 \n"
- "st1 {v22.d}[1], [%0], %7 \n"
- "st1 {v21.d}[1], [%0], %7 \n"
- "st1 {v23.d}[1], [%0] \n"
-
- "add %1, %1, #16 \n" // src += 8*2
- "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
+ "st1 {v16.d}[0], [%0], %6 \n"
+ "st1 {v18.d}[0], [%0], %6 \n"
+ "st1 {v17.d}[0], [%0], %6 \n"
+ "st1 {v19.d}[0], [%0], %6 \n"
+ "st1 {v16.d}[1], [%0], %6 \n"
+ "st1 {v18.d}[1], [%0], %6 \n"
+ "st1 {v17.d}[1], [%0], %6 \n"
+ "st1 {v19.d}[1], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "st1 {v20.d}[0], [%0], %7 \n"
+ "st1 {v22.d}[0], [%0], %7 \n"
+ "st1 {v21.d}[0], [%0], %7 \n"
+ "st1 {v23.d}[0], [%0], %7 \n"
+ "st1 {v20.d}[1], [%0], %7 \n"
+ "st1 {v22.d}[1], [%0], %7 \n"
+ "st1 {v21.d}[1], [%0], %7 \n"
+ "st1 {v23.d}[1], [%0] \n"
+
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
// dst_stride_a
- "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
// dst_stride_b
- "subs %w4, %w4, #8 \n" // w -= 8
- "b.ge 1b \n"
+ "subs %w4, %w4, #8 \n" // w -= 8
+ "b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w4, %w4, #8 \n"
- "b.eq 4f \n"
+ "adds %w4, %w4, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w4, #2 \n"
- "b.lt 3f \n"
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
- "cmp %w4, #4 \n"
- "b.lt 2f \n"
+ "cmp %w4, #4 \n"
+ "b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
@@ -406,18 +423,57 @@ void TransposeUVWx8_NEON(const uint8_t* src,
"4: \n"
- : "=&r"(src_temp), // %0
- "+r"(src), // %1
- "+r"(dst_a), // %2
- "+r"(dst_b), // %3
- "+r"(width) // %4
- : "r"(static_cast<ptrdiff_t>(src_stride)), // %5
- "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
- "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
- "r"(&kVTbl4x4TransposeDi) // %8
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst_a), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "r"((ptrdiff_t)src_stride), // %5
+ "r"((ptrdiff_t)dst_stride_a), // %6
+ "r"((ptrdiff_t)dst_stride_b), // %7
+ "r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
}
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ const uint8_t* src1 = src + src_stride;
+ const uint8_t* src2 = src1 + src_stride;
+ const uint8_t* src3 = src2 + src_stride;
+ uint8_t* dst1 = dst + dst_stride;
+ uint8_t* dst2 = dst1 + dst_stride;
+ uint8_t* dst3 = dst2 + dst_stride;
+ asm volatile(
+ // Main loop transpose 4x4. Read a column, write a row.
+ "1: \n"
+ "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
+ "ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n"
+ "ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n"
+ "ld4 {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n"
+ "subs %w8, %w8, #4 \n" // w -= 4
+ "st1 {v0.4s}, [%4], 16 \n"
+ "st1 {v1.4s}, [%5], 16 \n"
+ "st1 {v2.4s}, [%6], 16 \n"
+ "st1 {v3.4s}, [%7], 16 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(dst1), // %5
+ "+r"(dst2), // %6
+ "+r"(dst3), // %7
+ "+r"(width) // %8
+ : "r"((ptrdiff_t)(src_stride * 4)) // %9
+ : "memory", "cc", "v0", "v1", "v2", "v3");
+}
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
diff --git a/files/source/rotate_win.cc b/source/rotate_win.cc
index e887dd52..a78873f8 100644
--- a/files/source/rotate_win.cc
+++ b/source/rotate_win.cc
@@ -16,8 +16,9 @@ namespace libyuv {
extern "C" {
#endif
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for 32 bit Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ !defined(__clang__) && defined(_M_IX86)
__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
int src_stride,
diff --git a/source/row_any.cc b/source/row_any.cc
new file mode 100644
index 00000000..e574543c
--- /dev/null
+++ b/source/row_any.cc
@@ -0,0 +1,2459 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h> // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The memset of vin clears the padded source buffers so that SIMD code that
+// reads a full multiple of 16 bytes will not trigger msan errors.
+// memset is not needed for production, as the garbage values are processed but
+// not used, although there may be edge cases for subsampling.
+// The size of the buffer is based on the largest read, which can be inferred
+// from the source type (e.g. ARGB) and the mask (last parameter), or by
+// examining how far the source pointers are advanced in the source code.
+
+// A subsampled source width is increased by 1 if it is not even.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
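+// For example (a quick check of the rounding arithmetic):
+//   SS(5, 1) == 3   // 5 luma pixels need 3 subsampled chroma samples
+//   SS(4, 1) == 2
+//   SS(5, 0) == 5   // shift 0: full resolution, no rounding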
+
+// Any 4 planes to 1
+#define ANY41(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t vin[64 * 4]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, n); \
+ } \
+ memcpy(vin, y_buf + n, r); \
+ memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(vin + 192, a_buf + n, r); \
+ ANY_SIMD(vin, vin + 64, vin + 128, vin + 192, vout, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
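+
+// Hand-expanded sketch of one instance, with the mask folded in:
+// ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7) generates
+// approximately the following (not literal preprocessor output):
+//   void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* u_buf,
+//                              const uint8_t* v_buf, const uint8_t* a_buf,
+//                              uint8_t* dst_ptr, int width) {
+//     SIMD_ALIGNED(uint8_t vin[64 * 4]);
+//     SIMD_ALIGNED(uint8_t vout[64]);
+//     memset(vin, 0, sizeof(vin));
+//     int r = width & 7;   // remainder pixels
+//     int n = width & ~7;  // multiple-of-8 main portion
+//     if (n > 0) {
+//       MergeARGBRow_SSE2(y_buf, u_buf, v_buf, a_buf, dst_ptr, n);
+//     }
+//     memcpy(vin, y_buf + n, r);       // UVSHIFT == 0: planes are full width
+//     memcpy(vin + 64, u_buf + n, r);
+//     memcpy(vin + 128, v_buf + n, r);
+//     memcpy(vin + 192, a_buf + n, r);
+//     MergeARGBRow_SSE2(vin, vin + 64, vin + 128, vin + 192, vout, 8);
+//     memcpy(dst_ptr + n * 4, vout, r * 4);  // BPP == 4
+//   }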
+
+#ifdef HAS_MERGEARGBROW_SSE2
+ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7)
+#endif
+#ifdef HAS_MERGEARGBROW_AVX2
+ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_MERGEARGBROW_NEON
+ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15)
+#endif
+
+// Note that odd-width replication also applies to 444 formats, because the
+// ARM implementation subsamples 444 to 422 internally.
+// Any 4 planes to 1 with yuvconstants
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t vin[64 * 4]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, y_buf + n, r); \
+ memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(vin + 192, a_buf + n, r); \
+ if (width & 1) { \
+ vin[64 + SS(r, UVSHIFT)] = vin[64 + SS(r, UVSHIFT) - 1]; \
+ vin[128 + SS(r, UVSHIFT)] = vin[128 + SS(r, UVSHIFT) - 1]; \
+ } \
+ ANY_SIMD(vin, vin + 64, vin + 128, vin + 192, vout, yuvconstants, \
+ MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I444ALPHATOARGBROW_SSSE3
+ANY41C(I444AlphaToARGBRow_Any_SSSE3, I444AlphaToARGBRow_SSSE3, 0, 0, 4, 7)
+#endif
+#ifdef HAS_I444ALPHATOARGBROW_AVX2
+ANY41C(I444AlphaToARGBRow_Any_AVX2, I444AlphaToARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444ALPHATOARGBROW_NEON
+ANY41C(I444AlphaToARGBRow_Any_NEON, I444AlphaToARGBRow_NEON, 0, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_NEON
+ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I444ALPHATOARGBROW_MSA
+ANY41C(I444AlphaToARGBRow_Any_MSA, I444AlphaToARGBRow_MSA, 0, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_MSA
+ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_LSX
+ANY41C(I422AlphaToARGBRow_Any_LSX, I422AlphaToARGBRow_LSX, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_LASX
+ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15)
+#endif
+#undef ANY41C
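+
+// Worked example of the odd-width replication above (an ANY41C instance with
+// UVSHIFT == 1, MASK == 7): width = 7 gives n = 0 and r = 7, so 7 luma and
+// 7 alpha bytes are copied, SS(7, 1) == 4 chroma bytes land at vin + 64 and
+// vin + 128, and the odd width triggers vin[68] = vin[67] and
+// vin[132] = vin[131], so the last chroma pair is well defined.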
+
+// Any 4 planes to 1 plane of 8 bit with yuvconstants
+#define ANY41CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+ void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \
+ uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
+ int width) { \
+ SIMD_ALIGNED(T vin[16 * 4]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, y_buf + n, r * SBPP); \
+ memcpy(vin + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
+ memcpy(vin + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
+ memcpy(vin + 48, a_buf + n, r * SBPP); \
+ ANY_SIMD(vin, vin + 16, vin + 32, vin + 48, vout, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I210ALPHATOARGBROW_SSSE3
+ANY41CT(I210AlphaToARGBRow_Any_SSSE3,
+ I210AlphaToARGBRow_SSSE3,
+ 1,
+ 0,
+ uint16_t,
+ 2,
+ 4,
+ 7)
+#endif
+
+#ifdef HAS_I210ALPHATOARGBROW_AVX2
+ANY41CT(I210AlphaToARGBRow_Any_AVX2,
+ I210AlphaToARGBRow_AVX2,
+ 1,
+ 0,
+ uint16_t,
+ 2,
+ 4,
+ 15)
+#endif
+
+#ifdef HAS_I410ALPHATOARGBROW_SSSE3
+ANY41CT(I410AlphaToARGBRow_Any_SSSE3,
+ I410AlphaToARGBRow_SSSE3,
+ 0,
+ 0,
+ uint16_t,
+ 2,
+ 4,
+ 7)
+#endif
+
+#ifdef HAS_I410ALPHATOARGBROW_AVX2
+ANY41CT(I410AlphaToARGBRow_Any_AVX2,
+ I410AlphaToARGBRow_AVX2,
+ 0,
+ 0,
+ uint16_t,
+ 2,
+ 4,
+ 15)
+#endif
+
+#undef ANY41CT
+
+// Any 4 planes to 1 plane with parameter
+#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \
+ void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \
+ const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \
+ SIMD_ALIGNED(STYPE vin[16 * 4]); \
+ SIMD_ALIGNED(DTYPE vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \
+ } \
+ memcpy(vin, r_buf + n, r * SBPP); \
+ memcpy(vin + 16, g_buf + n, r * SBPP); \
+ memcpy(vin + 32, b_buf + n, r * SBPP); \
+ memcpy(vin + 48, a_buf + n, r * SBPP); \
+ ANY_SIMD(vin, vin + 16, vin + 32, vin + 48, vout, depth, MASK + 1); \
+ memcpy((uint8_t*)dst_ptr + n * BPP, vout, r * BPP); \
+ }
+
+#ifdef HAS_MERGEAR64ROW_AVX2
+ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15)
+#endif
+
+#ifdef HAS_MERGEAR64ROW_NEON
+ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 8, 7)
+#endif
+
+#ifdef HAS_MERGEARGB16TO8ROW_AVX2
+ANY41PT(MergeARGB16To8Row_Any_AVX2,
+ MergeARGB16To8Row_AVX2,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 15)
+#endif
+
+#ifdef HAS_MERGEARGB16TO8ROW_NEON
+ANY41PT(MergeARGB16To8Row_Any_NEON,
+ MergeARGB16To8Row_NEON,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 7)
+#endif
+
+#undef ANY41PT
+
+// Any 3 planes to 1.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t vin[64 * 3]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
+ } \
+ memcpy(vin, y_buf + n, r); \
+ memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ ANY_SIMD(vin, vin + 64, vin + 128, vout, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
+
+// Merge functions.
+#ifdef HAS_MERGERGBROW_SSSE3
+ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
+#endif
+#ifdef HAS_MERGERGBROW_NEON
+ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
+#endif
+#ifdef HAS_MERGEXRGBROW_SSE2
+ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7)
+#endif
+#ifdef HAS_MERGEXRGBROW_AVX2
+ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_MERGEXRGBROW_NEON
+ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_SSE2
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_AVX2
+ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31)
+ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOYUY2ROW_NEON
+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_MSA
+ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOYUY2ROW_LSX
+ANY31(I422ToYUY2Row_Any_LSX, I422ToYUY2Row_LSX, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_LASX
+ANY31(I422ToYUY2Row_Any_LASX, I422ToYUY2Row_LASX, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOUYVYROW_NEON
+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOUYVYROW_MSA
+ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOUYVYROW_LSX
+ANY31(I422ToUYVYRow_Any_LSX, I422ToUYVYRow_LSX, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOUYVYROW_LASX
+ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31)
+#endif
+#ifdef HAS_BLENDPLANEROW_AVX2
+ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
+#endif
+#ifdef HAS_BLENDPLANEROW_SSSE3
+ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
+#endif
+#undef ANY31
+
+// Note that odd-width replication also applies to 444 formats, because the
+// ARM implementation subsamples 444 to 422 internally.
+// Any 3 planes to 1 with yuvconstants
+#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t vin[128 * 3]); \
+ SIMD_ALIGNED(uint8_t vout[128]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, y_buf + n, r); \
+ memcpy(vin + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(vin + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ if (width & 1) { \
+ vin[128 + SS(r, UVSHIFT)] = vin[128 + SS(r, UVSHIFT) - 1]; \
+ vin[256 + SS(r, UVSHIFT)] = vin[256 + SS(r, UVSHIFT) - 1]; \
+ } \
+ ANY_SIMD(vin, vin + 128, vin + 256, vout, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TORGBAROW_SSSE3
+ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_SSSE3
+ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_SSSE3
+ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TORGB565ROW_SSSE3
+ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TORGB24ROW_SSSE3
+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
+#endif
+#ifdef HAS_I422TOAR30ROW_SSSE3
+ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TOAR30ROW_AVX2
+ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_SSSE3
+ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+#endif
+#ifdef HAS_I444TORGB24ROW_SSSE3
+ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15)
+#endif
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
+#endif
+#ifdef HAS_I422TOARGBROW_AVX2
+ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOARGBROW_AVX512BW
+ANY31C(I422ToARGBRow_Any_AVX512BW, I422ToARGBRow_AVX512BW, 1, 0, 4, 31)
+#endif
+#ifdef HAS_I422TORGBAROW_AVX2
+ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_AVX2
+ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I444TORGB24ROW_AVX2
+ANY31C(I444ToRGB24Row_Any_AVX2, I444ToRGB24Row_AVX2, 0, 0, 3, 31)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_AVX2
+ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_AVX2
+ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I422TORGB565ROW_AVX2
+ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I444TORGB24ROW_NEON
+ANY31C(I444ToRGB24Row_Any_NEON, I444ToRGB24Row_NEON, 0, 0, 3, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_NEON
+ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
+ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_MSA
+ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_LSX
+ANY31C(I422ToARGBRow_Any_LSX, I422ToARGBRow_LSX, 1, 0, 4, 15)
+ANY31C(I422ToRGBARow_Any_LSX, I422ToRGBARow_LSX, 1, 0, 4, 15)
+ANY31C(I422ToRGB24Row_Any_LSX, I422ToRGB24Row_LSX, 1, 0, 3, 15)
+ANY31C(I422ToRGB565Row_Any_LSX, I422ToRGB565Row_LSX, 1, 0, 2, 15)
+ANY31C(I422ToARGB4444Row_Any_LSX, I422ToARGB4444Row_LSX, 1, 0, 2, 15)
+ANY31C(I422ToARGB1555Row_Any_LSX, I422ToARGB1555Row_LSX, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I422TOARGBROW_LASX
+ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31)
+ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31)
+ANY31C(I422ToRGB24Row_Any_LASX, I422ToRGB24Row_LASX, 1, 0, 3, 31)
+ANY31C(I422ToRGB565Row_Any_LASX, I422ToRGB565Row_LASX, 1, 0, 2, 31)
+ANY31C(I422ToARGB4444Row_Any_LASX, I422ToARGB4444Row_LASX, 1, 0, 2, 31)
+ANY31C(I422ToARGB1555Row_Any_LASX, I422ToARGB1555Row_LASX, 1, 0, 2, 31)
+#endif
+#ifdef HAS_I444TOARGBROW_LSX
+ANY31C(I444ToARGBRow_Any_LSX, I444ToARGBRow_LSX, 0, 0, 4, 15)
+#endif
+#undef ANY31C
+
+// Any 3 planes of 16 bit to 1 with yuvconstants
+// TODO(fbarchard): consider sharing this code with ANY31C
+#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+ void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \
+ uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
+ int width) { \
+ SIMD_ALIGNED(T vin[16 * 3]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, y_buf + n, r * SBPP); \
+ memcpy(vin + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
+ memcpy(vin + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(vin, vin + 16, vin + 32, vout, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
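+
+// Note the buffer offsets above are in elements of T, not bytes: for
+// T == uint16_t, vin + 16 is 32 bytes past vin, and r * SBPP with SBPP == 2
+// copies r 16-bit samples.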
+
+#ifdef HAS_I210TOAR30ROW_SSSE3
+ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I210TOARGBROW_SSSE3
+ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I210TOARGBROW_AVX2
+ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I210TOAR30ROW_AVX2
+ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I410TOAR30ROW_SSSE3
+ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I410TOARGBROW_SSSE3
+ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I410TOARGBROW_AVX2
+ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I410TOAR30ROW_AVX2
+ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I212TOAR30ROW_SSSE3
+ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I212TOARGBROW_SSSE3
+ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I212TOARGBROW_AVX2
+ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I212TOAR30ROW_AVX2
+ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#undef ANY31CT
+
+// Any 3 planes to 1 plane with parameter
+#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \
+ void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \
+ DTYPE* dst_ptr, int depth, int width) { \
+ SIMD_ALIGNED(STYPE vin[16 * 3]); \
+ SIMD_ALIGNED(DTYPE vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \
+ } \
+ memcpy(vin, r_buf + n, r * SBPP); \
+ memcpy(vin + 16, g_buf + n, r * SBPP); \
+ memcpy(vin + 32, b_buf + n, r * SBPP); \
+ ANY_SIMD(vin, vin + 16, vin + 32, vout, depth, MASK + 1); \
+ memcpy((uint8_t*)dst_ptr + n * BPP, vout, r * BPP); \
+ }
+
+#ifdef HAS_MERGEXR30ROW_AVX2
+ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
+#endif
+
+#ifdef HAS_MERGEXR30ROW_NEON
+ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3)
+ANY31PT(MergeXR30Row_10_Any_NEON,
+ MergeXR30Row_10_NEON,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 3)
+#endif
+
+#ifdef HAS_MERGEXR64ROW_AVX2
+ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15)
+#endif
+
+#ifdef HAS_MERGEXR64ROW_NEON
+ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 8, 7)
+#endif
+
+#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
+ANY31PT(MergeXRGB16To8Row_Any_AVX2,
+ MergeXRGB16To8Row_AVX2,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 15)
+#endif
+
+#ifdef HAS_MERGEXRGB16TO8ROW_NEON
+ANY31PT(MergeXRGB16To8Row_Any_NEON,
+ MergeXRGB16To8Row_NEON,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 7)
+#endif
+
+#undef ANY31PT
+
+// Any 2 planes to 1.
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t vin[128 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[128]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
+ } \
+ memcpy(vin, y_buf + n * SBPP, r * SBPP); \
+ memcpy(vin + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(vin, vin + 128, vout, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
+
+// Merge functions.
+#ifdef HAS_MERGEUVROW_SSE2
+ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX2
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX512BW
+ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31)
+#endif
+#ifdef HAS_MERGEUVROW_NEON
+ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_MSA
+ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_LSX
+ANY21(MergeUVRow_Any_LSX, MergeUVRow_LSX, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_NEON
+ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_SSSE3
+ANY21(NV21ToYUV24Row_Any_SSSE3, NV21ToYUV24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+// Math functions.
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_SSE2
+ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_NEON
+ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_MSA
+ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_LSX
+ANY21(ARGBMultiplyRow_Any_LSX, ARGBMultiplyRow_LSX, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_LASX
+ANY21(ARGBMultiplyRow_Any_LASX, ARGBMultiplyRow_LASX, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_MSA
+ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_LSX
+ANY21(ARGBAddRow_Any_LSX, ARGBAddRow_LSX, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_LASX
+ANY21(ARGBAddRow_Any_LASX, ARGBAddRow_LASX, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_MSA
+ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_LSX
+ANY21(ARGBSubtractRow_Any_LSX, ARGBSubtractRow_LSX, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_LASX
+ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_SOBELROW_SSE2
+ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_NEON
+ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELROW_MSA
+ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_LSX
+ANY21(SobelRow_Any_LSX, SobelRow_LSX, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_NEON
+ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_MSA
+ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_LSX
+ANY21(SobelToPlaneRow_Any_LSX, SobelToPlaneRow_LSX, 0, 1, 1, 1, 31)
+#endif
+#ifdef HAS_SOBELXYROW_SSE2
+ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_NEON
+ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELXYROW_MSA
+ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_LSX
+ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15)
+#endif
+#undef ANY21
+
+// Any 2 planes to 1 with stride.
+// width is measured in source pixels; every 4 source bytes contain 2 pixels.
+#define ANY21S(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t vin[32 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[32]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int awidth = (width + 1) / 2; \
+ int r = awidth & MASK; \
+ int n = awidth & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_yuy2, stride_yuy2, dst_uv, n * 2); \
+ } \
+ memcpy(vin, src_yuy2 + n * SBPP, r * SBPP); \
+ memcpy(vin + 32, src_yuy2 + stride_yuy2 + n * SBPP, r * SBPP); \
+ ANY_SIMD(vin, 32, vout, MASK + 1); \
+ memcpy(dst_uv + n * BPP, vout, r * BPP); \
+ }
+
+#ifdef HAS_YUY2TONVUVROW_NEON
+ANY21S(YUY2ToNVUVRow_Any_NEON, YUY2ToNVUVRow_NEON, 4, 2, 7)
+#endif
+#ifdef HAS_YUY2TONVUVROW_SSE2
+ANY21S(YUY2ToNVUVRow_Any_SSE2, YUY2ToNVUVRow_SSE2, 4, 2, 7)
+#endif
+#ifdef HAS_YUY2TONVUVROW_AVX2
+ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15)
+#endif
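+
+// Worked example for the stride variant above (MASK == 7, SBPP == 4):
+// width = 17 luma pixels gives awidth = (17 + 1) / 2 = 9 four-byte
+// macropixels, so n = 8 macropixels go through the SIMD path (called with
+// n * 2 = 16) and the r = 1 leftover macropixel is copied from both rows
+// into vin and vin + 32, then processed from the padded buffers with a row
+// stride of 32.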
+
+// Any 2 planes to 1 with yuvconstants
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t vin[128 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[128]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, y_buf + n * SBPP, r * SBPP); \
+ memcpy(vin + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(vin, vin + 128, vout, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
+
+// Biplanar to RGB.
+#ifdef HAS_NV12TOARGBROW_SSSE3
+ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV12TOARGBROW_NEON
+ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_MSA
+ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_LSX
+ANY21C(NV12ToARGBRow_Any_LSX, NV12ToARGBRow_LSX, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_LASX
+ANY21C(NV12ToARGBRow_Any_LASX, NV12ToARGBRow_LASX, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV21TOARGBROW_SSSE3
+ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_AVX2
+ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV21TOARGBROW_NEON
+ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_MSA
+ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_LSX
+ANY21C(NV21ToARGBRow_Any_LSX, NV21ToARGBRow_LSX, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_LASX
+ANY21C(NV21ToARGBRow_Any_LASX, NV21ToARGBRow_LASX, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV12TORGB24ROW_NEON
+ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV21TORGB24ROW_NEON
+ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV12TORGB24ROW_SSSE3
+ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TORGB24ROW_SSSE3
+ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV12TORGB24ROW_AVX2
+ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+#ifdef HAS_NV21TORGB24ROW_AVX2
+ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+#endif
+#ifdef HAS_NV12TORGB565ROW_NEON
+ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_MSA
+ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_LSX
+ANY21C(NV12ToRGB565Row_Any_LSX, NV12ToRGB565Row_LSX, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_LASX
+ANY21C(NV12ToRGB565Row_Any_LASX, NV12ToRGB565Row_LASX, 1, 1, 2, 2, 15)
+#endif
+#undef ANY21C
+
+// Any 2 planes of 16 bit to 1 with yuvconstants
+#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+ void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(T vin[16 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, y_buf + n, r * SBPP); \
+ memcpy(vin + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \
+ ANY_SIMD(vin, vin + 16, vout, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
+ }
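+
+// uv_buf holds interleaved U/V samples, so each chroma position is two
+// elements of T: the tail copy starts at uv_buf + 2 * (n >> UVSHIFT) and
+// moves SS(r, UVSHIFT) * 2 samples into vin + 16.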
+
+#ifdef HAS_P210TOAR30ROW_SSSE3
+ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P210TOARGBROW_SSSE3
+ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P210TOARGBROW_AVX2
+ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_P210TOAR30ROW_AVX2
+ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_P410TOAR30ROW_SSSE3
+ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P410TOARGBROW_SSSE3
+ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P410TOARGBROW_AVX2
+ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_P410TOAR30ROW_AVX2
+ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+
+#undef ANY21CT
+
+// Any 2 16-bit planes with a depth parameter to 1 interleaved plane
+#define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \
+ int width) { \
+ SIMD_ALIGNED(T vin[16 * 2]); \
+ SIMD_ALIGNED(T vout[16]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_u, src_v, dst_uv, depth, n); \
+ } \
+ memcpy(vin, src_u + n, r * BPP); \
+ memcpy(vin + 16, src_v + n, r * BPP); \
+ ANY_SIMD(vin, vin + 16, vout, depth, MASK + 1); \
+ memcpy(dst_uv + n * 2, vout, r * BPP * 2); \
+ }
+
+#ifdef HAS_MERGEUVROW_16_AVX2
+ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 7)
+#endif
+#ifdef HAS_MERGEUVROW_16_NEON
+ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7)
+#endif
+
+#undef ANY21PT
+
+// Any 1 to 1.
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t vin[128]); \
+ SIMD_ALIGNED(uint8_t vout[128]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(vin, vout, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
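+
+// UVSHIFT halves the pixel-to-byte step for packed 4:2:2 sources: YUY2ToYRow
+// uses UVSHIFT = 1 with SBPP = 4, so the remainder copy reads
+// SS(r, 1) = (r + 1) / 2 four-byte groups (each YUY2 group holds 2 pixels),
+// while plain per-pixel conversions use UVSHIFT = 0.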
+
+#ifdef HAS_COPYROW_AVX
+ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ABGRTOAR30ROW_AVX2)
+ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_SSE2)
+ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
+#endif
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)
+#endif
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_LSX)
+ANY11(ARGBToRGB24Row_Any_LSX, ARGBToRGB24Row_LSX, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_LSX, ARGBToRAWRow_LSX, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_LSX, ARGBToRGB565Row_LSX, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_LSX, ARGBToARGB1555Row_LSX, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_LSX, ARGBToARGB4444Row_LSX, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_LASX)
+ANY11(ARGBToRGB24Row_Any_LASX, ARGBToRGB24Row_LASX, 0, 4, 3, 31)
+ANY11(ARGBToRAWRow_Any_LASX, ARGBToRAWRow_LASX, 0, 4, 3, 31)
+ANY11(ARGBToRGB565Row_Any_LASX, ARGBToRGB565Row_LASX, 0, 4, 2, 15)
+ANY11(ARGBToARGB1555Row_Any_LASX, ARGBToARGB1555Row_LASX, 0, 4, 2, 15)
+ANY11(ARGBToARGB4444Row_Any_LASX, ARGBToARGB4444Row_LASX, 0, 4, 2, 15)
+#endif
+#if defined(HAS_J400TOARGBROW_LSX)
+ANY11(J400ToARGBRow_Any_LSX, J400ToARGBRow_LSX, 0, 1, 4, 15)
+#endif
+#if defined(HAS_RAWTORGB24ROW_NEON)
+ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
+#endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
+#endif
+#if defined(HAS_RAWTORGB24ROW_LSX)
+ANY11(RAWToRGB24Row_Any_LSX, RAWToRGB24Row_LSX, 0, 3, 3, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_AVX2
+ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ABGRTOYROW_AVX2
+ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_AVX2
+ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ABGRTOYJROW_AVX2
+ANY11(ABGRToYJRow_Any_AVX2, ABGRToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_RGBATOYJROW_AVX2
+ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_AVX2
+ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_AVX2
+ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYROW_SSSE3
+ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_SSSE3
+ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_SSE2
+ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
+ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_SSSE3
+ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ABGRTOYJROW_SSSE3
+ANY11(ABGRToYJRow_Any_SSSE3, ABGRToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYJROW_SSSE3
+ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_MSA
+ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_LSX
+ANY11(ARGBToYRow_Any_LSX, ARGBToYRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_LASX
+ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_NEON
+ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ABGRTOYJROW_NEON
+ANY11(ABGRToYJRow_Any_NEON, ABGRToYJRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYJROW_NEON
+ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_MSA
+ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_LSX
+ANY11(ARGBToYJRow_Any_LSX, ARGBToYJRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYJROW_LSX
+ANY11(RGBAToYJRow_Any_LSX, RGBAToYJRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ABGRTOYJROW_LSX
+ANY11(ABGRToYJRow_Any_LSX, ABGRToYJRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYJROW_LASX
+ANY11(RGBAToYJRow_Any_LASX, RGBAToYJRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_LASX
+ANY11(ARGBToYJRow_Any_LASX, ARGBToYJRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ABGRTOYJROW_LASX
+ANY11(ABGRToYJRow_Any_LASX, ABGRToYJRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_BGRATOYROW_NEON
+ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_MSA
+ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_LSX
+ANY11(BGRAToYRow_Any_LSX, BGRAToYRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_LASX
+ANY11(BGRAToYRow_Any_LASX, BGRAToYRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ABGRTOYROW_NEON
+ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ABGRTOYROW_MSA
+ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_LSX
+ANY11(ABGRToYRow_Any_LSX, ABGRToYRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ABGRTOYROW_LASX
+ANY11(ABGRToYRow_Any_LASX, ABGRToYRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_RGBATOYROW_NEON
+ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYROW_MSA
+ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYROW_LSX
+ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYROW_LASX
+ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31)
+#endif
+#ifdef HAS_RGB24TOYROW_NEON
+ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RGB24TOYJROW_SSSE3
+ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_NEON
+ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYROW_MSA
+ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYROW_LSX
+ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_LSX
+ANY11(RGB24ToYJRow_Any_LSX, RGB24ToYJRow_LSX, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_LASX
+ANY11(RGB24ToYJRow_Any_LASX, RGB24ToYJRow_LASX, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RGB24TOYROW_LASX
+ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RAWTOYROW_NEON
+ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_AVX2
+ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RAWTOYJROW_SSSE3
+ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_NEON
+ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYROW_MSA
+ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYROW_LSX
+ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYROW_LASX
+ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RAWTOYJROW_LSX
+ANY11(RAWToYJRow_Any_LSX, RAWToYJRow_LSX, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_LASX
+ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RGB565TOYROW_NEON
+ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_RGB565TOYROW_MSA
+ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
+#endif
+#ifdef HAS_RGB565TOYROW_LSX
+ANY11(RGB565ToYRow_Any_LSX, RGB565ToYRow_LSX, 0, 2, 1, 15)
+#endif
+#ifdef HAS_RGB565TOYROW_LASX
+ANY11(RGB565ToYRow_Any_LASX, RGB565ToYRow_LASX, 0, 2, 1, 31)
+#endif
+#ifdef HAS_ARGB1555TOYROW_NEON
+ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB1555TOYROW_MSA
+ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
+#endif
+#ifdef HAS_ARGB1555TOYROW_LSX
+ANY11(ARGB1555ToYRow_Any_LSX, ARGB1555ToYRow_LSX, 0, 2, 1, 15)
+#endif
+#ifdef HAS_ARGB1555TOYROW_LASX
+ANY11(ARGB1555ToYRow_Any_LASX, ARGB1555ToYRow_LASX, 0, 2, 1, 31)
+#endif
+#ifdef HAS_ARGB4444TOYROW_NEON
+ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
+ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_MSA
+ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_LSX
+ANY11(YUY2ToYRow_Any_LSX, YUY2ToYRow_LSX, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_LASX
+ANY11(YUY2ToYRow_Any_LASX, YUY2ToYRow_LASX, 1, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_MSA
+ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_LSX
+ANY11(UYVYToYRow_Any_LSX, UYVYToYRow_LSX, 1, 4, 1, 15)
+#endif
+#ifdef HAS_UYVYTOYROW_LASX
+ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31)
+#endif
+#ifdef HAS_AYUVTOYROW_NEON
+ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_SWAPUVROW_SSSE3
+ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15)
+#endif
+#ifdef HAS_SWAPUVROW_AVX2
+ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31)
+#endif
+#ifdef HAS_SWAPUVROW_NEON
+ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
+ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RGB24TOARGBROW_MSA
+ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_LSX
+ANY11(RGB24ToARGBRow_Any_LSX, RGB24ToARGBRow_LSX, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_LASX
+ANY11(RGB24ToARGBRow_Any_LASX, RGB24ToARGBRow_LASX, 0, 3, 4, 31)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
+ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTORGBAROW_NEON
+ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_MSA
+ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RAWTOARGBROW_LSX
+ANY11(RAWToARGBRow_Any_LSX, RAWToARGBRow_LSX, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RAWTOARGBROW_LASX
+ANY11(RAWToARGBRow_Any_LASX, RAWToARGBRow_LASX, 0, 3, 4, 31)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
+ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_RGB565TOARGBROW_MSA
+ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_RGB565TOARGBROW_LSX
+ANY11(RGB565ToARGBRow_Any_LSX, RGB565ToARGBRow_LSX, 0, 2, 4, 15)
+#endif
+#ifdef HAS_RGB565TOARGBROW_LASX
+ANY11(RGB565ToARGBRow_Any_LASX, RGB565ToARGBRow_LASX, 0, 2, 4, 31)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_MSA
+ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_LSX
+ANY11(ARGB1555ToARGBRow_Any_LSX, ARGB1555ToARGBRow_LSX, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_LASX
+ANY11(ARGB1555ToARGBRow_Any_LASX, ARGB1555ToARGBRow_LASX, 0, 2, 4, 31)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_MSA
+ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_LSX
+ANY11(ARGB4444ToARGBRow_Any_LSX, ARGB4444ToARGBRow_LSX, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_LASX
+ANY11(ARGB4444ToARGBRow_Any_LASX, ARGB4444ToARGBRow_LASX, 0, 2, 4, 31)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_MSA
+ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_LSX
+ANY11(ARGBAttenuateRow_Any_LSX, ARGBAttenuateRow_LSX, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_LASX
+ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
+ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
+ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_LSX
+ANY11(ARGBExtractAlphaRow_Any_LSX, ARGBExtractAlphaRow_LSX, 0, 4, 1, 15)
+#endif
+#undef ANY11
+
+// Any 1 to 1 blended. The destination is read-modify-write.
+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t vin[64]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ memset(vout, 0, sizeof(vout)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ memcpy(vout, dst_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(vin, vout, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
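+
+// vout is seeded with the existing destination pixels before the SIMD call so
+// the read-modify-write of the final partial block blends against real data
+// rather than zeros; only the r valid pixels are stored back.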
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
+#endif
+#undef ANY11B
+
+// Any 1 to 1 with parameter.
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \
+ SIMD_ALIGNED(uint8_t vin[64]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, param, n); \
+ } \
+ memcpy(vin, src_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(vin, vout, param, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
+
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11P(I400ToARGBRow_Any_SSE2,
+ I400ToARGBRow_SSE2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11P(I400ToARGBRow_Any_AVX2,
+ I400ToARGBRow_AVX2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ANY11P(I400ToARGBRow_Any_NEON,
+ I400ToARGBRow_NEON,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ANY11P(I400ToARGBRow_Any_MSA,
+ I400ToARGBRow_MSA,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_LSX)
+ANY11P(I400ToARGBRow_Any_LSX,
+ I400ToARGBRow_LSX,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
+ ARGBToRGB565DitherRow_SSE2,
+ const uint32_t,
+ 4,
+ 2,
+ 3)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
+ ARGBToRGB565DitherRow_AVX2,
+ const uint32_t,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON,
+ ARGBToRGB565DitherRow_NEON,
+ const uint32_t,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ANY11P(ARGBToRGB565DitherRow_Any_MSA,
+ ARGBToRGB565DitherRow_MSA,
+ const uint32_t,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LSX)
+ANY11P(ARGBToRGB565DitherRow_Any_LSX,
+ ARGBToRGB565DitherRow_LSX,
+ const uint32_t,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ANY11P(ARGBToRGB565DitherRow_Any_LASX,
+ ARGBToRGB565DitherRow_LASX,
+ const uint32_t,
+ 4,
+ 2,
+ 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_MSA
+ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_LSX
+ANY11P(ARGBShuffleRow_Any_LSX, ARGBShuffleRow_LSX, const uint8_t*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_LASX
+ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15)
+#endif
+#undef ANY11P
+
+// Any 1 to 1 with type
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
+ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t vin[(MASK + 1) * SBPP]); \
+ SIMD_ALIGNED(uint8_t vout[(MASK + 1) * BPP]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(vin, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \
+ ANY_SIMD((STYPE*)vin, (DTYPE*)vout, MASK + 1); \
+ memcpy((uint8_t*)(dst_ptr) + n * BPP, vout, r * BPP); \
+ }
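+
+// SBPP and BPP are byte counts here, so the offset arithmetic casts through
+// uint8_t* even though the row functions take typed pointers; e.g.
+// ARGBToAR64 reads 4 bytes and writes 8 bytes per pixel.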
+
+#ifdef HAS_ARGBTOAR64ROW_SSSE3
+ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3)
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_SSSE3
+ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3)
+#endif
+
+#ifdef HAS_AR64TOARGBROW_SSSE3
+ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3)
+#endif
+
+#ifdef HAS_ARGBTOAR64ROW_SSSE3
+ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3)
+#endif
+
+#ifdef HAS_ARGBTOAR64ROW_AVX2
+ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_AVX2
+ANY11T(ARGBToAB64Row_Any_AVX2, ARGBToAB64Row_AVX2, 4, 8, uint8_t, uint16_t, 7)
+#endif
+
+#ifdef HAS_AR64TOARGBROW_AVX2
+ANY11T(AR64ToARGBRow_Any_AVX2, AR64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAR64ROW_AVX2
+ANY11T(AB64ToARGBRow_Any_AVX2, AB64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAR64ROW_NEON
+ANY11T(ARGBToAR64Row_Any_NEON, ARGBToAR64Row_NEON, 4, 8, uint8_t, uint16_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_NEON
+ANY11T(ARGBToAB64Row_Any_NEON, ARGBToAB64Row_NEON, 4, 8, uint8_t, uint16_t, 7)
+#endif
+
+#ifdef HAS_AR64TOARGBROW_NEON
+ANY11T(AR64ToARGBRow_Any_NEON, AR64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAR64ROW_NEON
+ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7)
+#endif
+
+#undef ANY11T
+
+// Any 1 to 1 with a scale parameter. SBPP and BPP are bytes per element.
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
+ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
+ SIMD_ALIGNED(STYPE vin[32]); \
+ SIMD_ALIGNED(DTYPE vout[32]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, scale, n); \
+ } \
+ memcpy(vin, src_ptr + n, r * SBPP); \
+ ANY_SIMD(vin, vout, scale, MASK + 1); \
+ memcpy(dst_ptr + n, vout, r * BPP); \
+ }
+
+#ifdef HAS_CONVERT16TO8ROW_SSSE3
+ANY11C(Convert16To8Row_Any_SSSE3,
+ Convert16To8Row_SSSE3,
+ 2,
+ 1,
+ uint16_t,
+ uint8_t,
+ 15)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+ANY11C(Convert16To8Row_Any_AVX2,
+ Convert16To8Row_AVX2,
+ 2,
+ 1,
+ uint16_t,
+ uint8_t,
+ 31)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_NEON
+ANY11C(Convert16To8Row_Any_NEON,
+ Convert16To8Row_NEON,
+ 2,
+ 1,
+ uint16_t,
+ uint8_t,
+ 15)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_SSE2
+ANY11C(Convert8To16Row_Any_SSE2,
+ Convert8To16Row_SSE2,
+ 1,
+ 2,
+ uint8_t,
+ uint16_t,
+ 15)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+ANY11C(Convert8To16Row_Any_AVX2,
+ Convert8To16Row_AVX2,
+ 1,
+ 2,
+ uint8_t,
+ uint16_t,
+ 31)
+#endif
+#ifdef HAS_MULTIPLYROW_16_AVX2
+ANY11C(MultiplyRow_16_Any_AVX2,
+ MultiplyRow_16_AVX2,
+ 2,
+ 2,
+ uint16_t,
+ uint16_t,
+ 31)
+#endif
+#ifdef HAS_MULTIPLYROW_16_NEON
+ANY11C(MultiplyRow_16_Any_NEON,
+ MultiplyRow_16_NEON,
+ 2,
+ 2,
+ uint16_t,
+ uint16_t,
+ 15)
+#endif
+#ifdef HAS_DIVIDEROW_16_AVX2
+ANY11C(DivideRow_16_Any_AVX2, DivideRow_16_AVX2, 2, 2, uint16_t, uint16_t, 31)
+#endif
+#ifdef HAS_DIVIDEROW_16_NEON
+ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 with a float parameter, such as half-float conversion. BPP is
+// measured in shorts.
+#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \
+ void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
+ SIMD_ALIGNED(ST vin[32]); \
+ SIMD_ALIGNED(T vout[32]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, param, n); \
+ } \
+ memcpy(vin, src_ptr + n, r * SBPP); \
+ ANY_SIMD(vin, vout, param, MASK + 1); \
+ memcpy(dst_ptr + n, vout, r * BPP); \
+ }
+
+#ifdef HAS_HALFFLOATROW_SSE2
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
+#endif
+#ifdef HAS_HALFFLOATROW_AVX2
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
+#endif
+#ifdef HAS_HALFFLOATROW_F16C
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
+ANY11P16(HalfFloat1Row_Any_F16C,
+ HalfFloat1Row_F16C,
+ uint16_t,
+ uint16_t,
+ 2,
+ 2,
+ 15)
+#endif
+#ifdef HAS_HALFFLOATROW_NEON
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
+ANY11P16(HalfFloat1Row_Any_NEON,
+ HalfFloat1Row_NEON,
+ uint16_t,
+ uint16_t,
+ 2,
+ 2,
+ 7)
+#endif
+#ifdef HAS_HALFFLOATROW_MSA
+ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
+#endif
+#ifdef HAS_BYTETOFLOATROW_NEON
+ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7)
+#endif
+#ifdef HAS_HALFFLOATROW_LSX
+ANY11P16(HalfFloatRow_Any_LSX, HalfFloatRow_LSX, uint16_t, uint16_t, 2, 2, 31)
+#endif
+#undef ANY11P16
+
+// Any 1 to 1 with yuvconstants
+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t vin[128]); \
+ SIMD_ALIGNED(uint8_t vout[128]); \
+ memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(vin, vout, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
+
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
+ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
+#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
+#endif
+#if defined(HAS_YUY2TOARGBROW_LSX)
+ANY11C(YUY2ToARGBRow_Any_LSX, YUY2ToARGBRow_LSX, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
+#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \
+ void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \
+ int width, int source_y_fraction) { \
+ SIMD_ALIGNED(TS vin[64 * 2]); \
+ SIMD_ALIGNED(TD vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \
+ } \
+ memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \
+ if (source_y_fraction) { \
+ memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \
+ r * SBPP * sizeof(TS)); \
+ } \
+ ANY_SIMD(vout, vin, 64, MASK + 1, source_y_fraction); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \
+ }
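+
+// The temp buffer holds both source rows at a fixed stride of 64 elements
+// (vin and vin + 64); the second row is copied only when source_y_fraction
+// is nonzero, since a fraction of 0 reads a single row.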
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSSE3
+ANY11I(InterpolateRow_Any_SSSE3,
+ InterpolateRow_SSSE3,
+ uint8_t,
+ uint8_t,
+ 1,
+ 1,
+ 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_NEON
+ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, uint8_t, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_MSA
+ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, uint8_t, 1, 1, 31)
+#endif
+#ifdef HAS_INTERPOLATEROW_LSX
+ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, uint8_t, 1, 1, 31)
+#endif
+
+#ifdef HAS_INTERPOLATEROW_16_NEON
+ANY11I(InterpolateRow_16_Any_NEON,
+ InterpolateRow_16_NEON,
+ uint16_t,
+ uint16_t,
+ 1,
+ 1,
+ 7)
+#endif
+#undef ANY11I
+
+// Any 1 to 1 interpolate with scale param
+#define ANY11IS(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \
+ void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \
+ int scale, int width, int source_y_fraction) { \
+ SIMD_ALIGNED(TS vin[64 * 2]); \
+ SIMD_ALIGNED(TD vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, src_ptr, src_stride, scale, n, source_y_fraction); \
+ } \
+ memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \
+ if (source_y_fraction) { \
+ memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \
+ r * SBPP * sizeof(TS)); \
+ } \
+ ANY_SIMD(vout, vin, 64, scale, MASK + 1, source_y_fraction); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \
+ }
+
+#ifdef HAS_INTERPOLATEROW_16TO8_NEON
+ANY11IS(InterpolateRow_16To8_Any_NEON,
+ InterpolateRow_16To8_NEON,
+ uint8_t,
+ uint16_t,
+ 1,
+ 1,
+ 7)
+#endif
+#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
+ANY11IS(InterpolateRow_16To8_Any_AVX2,
+ InterpolateRow_16To8_AVX2,
+ uint8_t,
+ uint16_t,
+ 1,
+ 1,
+ 31)
+#endif
+
+#undef ANY11IS
+
+// Any 1 to 1 mirror.
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t vin[64]); \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
+ } \
+ memcpy(vin, src_ptr, r* BPP); \
+ ANY_SIMD(vin, vout, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \
+ }
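+
+// Mirroring reverses the remainder handling: with width = 100 and MASK = 31,
+// the SIMD call mirrors src[4..99] into dst[0..95], while the first r = 4
+// source pixels go through the temp buffer and land in dst[96..99], taken
+// from the tail of vout at offset (MASK + 1 - r) * BPP.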
+
+#ifdef HAS_MIRRORROW_AVX2
+ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_SSSE3
+ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_NEON
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_MSA
+ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
+#endif
+#ifdef HAS_MIRRORROW_LSX
+ANY11M(MirrorRow_Any_LSX, MirrorRow_LSX, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_LASX
+ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63)
+#endif
+#ifdef HAS_MIRRORUVROW_AVX2
+ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
+#endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_NEON
+ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
+#endif
+#ifdef HAS_MIRRORUVROW_MSA
+ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_LSX
+ANY11M(MirrorUVRow_Any_LSX, MirrorUVRow_LSX, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_LASX
+ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_AVX2
+ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_SSE2
+ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
+#endif
+#ifdef HAS_ARGBMIRRORROW_NEON
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_MSA
+ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_LSX
+ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_LASX
+ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15)
+#endif
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_RGB24MIRRORROW_NEON
+ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15)
+#endif
+#undef ANY11M
+
+// Any 1 plane. (memset)
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
+ SIMD_ALIGNED(uint8_t vout[64]); \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, v32, n); \
+ } \
+ ANY_SIMD(vout, v32, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, vout, r * BPP); \
+ }
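+
+// A fill needs no input buffer: the remainder is generated into the temp
+// buffer as one full block and its first r pixels are copied to the
+// destination tail.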
+
+#ifdef HAS_SETROW_X86
+ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3)
+#endif
+#ifdef HAS_SETROW_NEON
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15)
+#endif
+#ifdef HAS_SETROW_LSX
+ANY1(SetRow_Any_LSX, SetRow_LSX, uint8_t, 1, 15)
+#endif
+#ifdef HAS_ARGBSETROW_NEON
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
+#endif
+#ifdef HAS_ARGBSETROW_MSA
+ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
+#endif
+#ifdef HAS_ARGBSETROW_LSX
+ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3)
+#endif
+#undef ANY1
+
+// Any 1 to 2. Outputs UV planes.
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t vin[128]); \
+ SIMD_ALIGNED(uint8_t vout[128 * 2]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ ANY_SIMD(vin, vout, vout + 128, MASK + 1); \
+ memcpy(dst_u + (n >> DUVSHIFT), vout, SS(r, DUVSHIFT)); \
+ memcpy(dst_v + (n >> DUVSHIFT), vout + 128, SS(r, DUVSHIFT)); \
+ }
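+
+// Both output planes share one temp buffer split at a fixed 128-byte offset
+// (vout for U, vout + 128 for V); DUVSHIFT scales how many output bytes the
+// r leftover pixels produce.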
+
+#ifdef HAS_SPLITUVROW_SSE2
+ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_MSA
+ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_LSX
+ANY12(SplitUVRow_Any_LSX, SplitUVRow_LSX, 0, 2, 0, 31)
+#endif
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_MSA
+ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_LSX
+ANY12(ARGBToUV444Row_Any_LSX, ARGBToUV444Row_LSX, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_LSX, YUY2ToUV422Row_LSX, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_LSX, UYVYToUV422Row_LSX, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_LASX
+ANY12(ARGBToUV444Row_Any_LASX, ARGBToUV444Row_LASX, 0, 4, 0, 31)
+ANY12(YUY2ToUV422Row_Any_LASX, YUY2ToUV422Row_LASX, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_LASX, UYVYToUV422Row_LASX, 1, 4, 1, 31)
+#endif
+#undef ANY12
+
+// Any 1 interleaved 16-bit plane with a depth parameter to 2 planes
+#define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \
+ SIMD_ALIGNED(T vin[16 * 2]); \
+ SIMD_ALIGNED(T vout[16 * 2]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_uv, dst_u, dst_v, depth, n); \
+ } \
+ memcpy(vin, src_uv + n * 2, r * BPP * 2); \
+ ANY_SIMD(vin, vout, vout + 16, depth, MASK + 1); \
+ memcpy(dst_u + n, vout, r * BPP); \
+ memcpy(dst_v + n, vout + 16, r * BPP); \
+ }
+
+#ifdef HAS_SPLITUVROW_16_AVX2
+ANY12PT(SplitUVRow_16_Any_AVX2, SplitUVRow_16_AVX2, uint16_t, 2, 15)
+#endif
+
+#ifdef HAS_SPLITUVROW_16_NEON
+ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7)
+#endif
+
+#undef ANY12PT
+
+// Any 1 to 3. Outputs RGB planes.
+#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \
+ uint8_t* dst_b, int width) { \
+ SIMD_ALIGNED(uint8_t vin[16 * 3]); \
+ SIMD_ALIGNED(uint8_t vout[16 * 3]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \
+ } \
+ memcpy(vin, src_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(vin, vout, vout + 16, vout + 32, MASK + 1); \
+ memcpy(dst_r + n, vout, r); \
+ memcpy(dst_g + n, vout + 16, r); \
+ memcpy(dst_b + n, vout + 32, r); \
+ }
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_SPLITRGBROW_NEON
+ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
+#endif
+#ifdef HAS_SPLITXRGBROW_SSE2
+ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7)
+#endif
+#ifdef HAS_SPLITXRGBROW_SSSE3
+ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7)
+#endif
+#ifdef HAS_SPLITXRGBROW_AVX2
+ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15)
+#endif
+#ifdef HAS_SPLITXRGBROW_NEON
+ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15)
+#endif
+
+// Any 1 to 4. Outputs ARGB planes.
+#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \
+ uint8_t* dst_b, uint8_t* dst_a, int width) { \
+ SIMD_ALIGNED(uint8_t vin[16 * 4]); \
+ SIMD_ALIGNED(uint8_t vout[16 * 4]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \
+ } \
+ memcpy(vin, src_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(vin, vout, vout + 16, vout + 32, vout + 48, MASK + 1); \
+ memcpy(dst_r + n, vout, r); \
+ memcpy(dst_g + n, vout + 16, r); \
+ memcpy(dst_b + n, vout + 32, r); \
+ memcpy(dst_a + n, vout + 48, r); \
+ }
+
+#ifdef HAS_SPLITARGBROW_SSE2
+ANY14(SplitARGBRow_Any_SSE2, SplitARGBRow_SSE2, 4, 7)
+#endif
+#ifdef HAS_SPLITARGBROW_SSSE3
+ANY14(SplitARGBRow_Any_SSSE3, SplitARGBRow_SSSE3, 4, 7)
+#endif
+#ifdef HAS_SPLITARGBROW_AVX2
+ANY14(SplitARGBRow_Any_AVX2, SplitARGBRow_AVX2, 4, 15)
+#endif
+#ifdef HAS_SPLITARGBROW_NEON
+ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
+#endif
+
+// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
+// A 128-byte temp row allows for 32 AVX ARGB pixels.
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \
+ uint8_t* dst_v, int width) { \
+ SIMD_ALIGNED(uint8_t vin[128 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[128 * 2]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \
+ } \
+ memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+ memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \
+ BPP); \
+ memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \
+ vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(vin, 128, vout, vout + 128, MASK + 1); \
+ memcpy(dst_u + (n >> 1), vout, SS(r, 1)); \
+ memcpy(dst_v + (n >> 1), vout + 128, SS(r, 1)); \
+ }
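+
+// For UVSHIFT == 0 formats (e.g. ARGB) with an odd width, the last pixel of
+// both temp rows is replicated so the 2x2 subsample producing the final U/V
+// sample averages a real pixel instead of the zero padding.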
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ABGRTOUVROW_AVX2
+ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_AVX2
+ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ABGRTOUVJROW_AVX2
+ANY12S(ABGRToUVJRow_Any_AVX2, ABGRToUVJRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVJROW_SSSE3
+ANY12S(ABGRToUVJRow_Any_SSSE3, ABGRToUVJRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
+ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
+ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_AVX2
+ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
+ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_SSE2
+ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
+ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_MSA
+ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_LSX
+ANY12S(ARGBToUVRow_Any_LSX, ARGBToUVRow_LSX, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_LASX
+ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_NEON
+ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVJROW_NEON
+ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_MSA
+ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_LSX
+ANY12S(ARGBToUVJRow_Any_LSX, ARGBToUVJRow_LSX, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_LASX
+ANY12S(ARGBToUVJRow_Any_LASX, ARGBToUVJRow_LASX, 0, 4, 31)
+#endif
+#ifdef HAS_BGRATOUVROW_NEON
+ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_MSA
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_LSX
+ANY12S(BGRAToUVRow_Any_LSX, BGRAToUVRow_LSX, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_NEON
+ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_MSA
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_LSX
+ANY12S(ABGRToUVRow_Any_LSX, ABGRToUVRow_LSX, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_NEON
+ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_MSA
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_LSX
+ANY12S(RGBAToUVRow_Any_LSX, RGBAToUVRow_LSX, 0, 4, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_NEON
+ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVJROW_NEON
+ANY12S(RGB24ToUVJRow_Any_NEON, RGB24ToUVJRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_MSA
+ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_LSX
+ANY12S(RGB24ToUVRow_Any_LSX, RGB24ToUVRow_LSX, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_LASX
+ANY12S(RGB24ToUVRow_Any_LASX, RGB24ToUVRow_LASX, 0, 3, 31)
+#endif
+#ifdef HAS_RAWTOUVROW_NEON
+ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVJROW_NEON
+ANY12S(RAWToUVJRow_Any_NEON, RAWToUVJRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_MSA
+ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_LSX
+ANY12S(RAWToUVRow_Any_LSX, RAWToUVRow_LSX, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_LASX
+ANY12S(RAWToUVRow_Any_LASX, RAWToUVRow_LASX, 0, 3, 31)
+#endif
+#ifdef HAS_RGB565TOUVROW_NEON
+ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_MSA
+ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_LSX
+ANY12S(RGB565ToUVRow_Any_LSX, RGB565ToUVRow_LSX, 0, 2, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_LASX
+ANY12S(RGB565ToUVRow_Any_LASX, RGB565ToUVRow_LASX, 0, 2, 31)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_NEON
+ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_MSA
+ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_LSX
+ANY12S(ARGB1555ToUVRow_Any_LSX, ARGB1555ToUVRow_LSX, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_LASX
+ANY12S(ARGB1555ToUVRow_Any_LASX, ARGB1555ToUVRow_LASX, 0, 2, 31)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_NEON
+ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
+ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
+ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_MSA
+ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_LSX
+ANY12S(YUY2ToUVRow_Any_LSX, YUY2ToUVRow_LSX, 1, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_LASX
+ANY12S(YUY2ToUVRow_Any_LASX, YUY2ToUVRow_LASX, 1, 4, 31)
+#endif
+#ifdef HAS_UYVYTOUVROW_MSA
+ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_UYVYTOUVROW_LSX
+ANY12S(UYVYToUVRow_Any_LSX, UYVYToUVRow_LSX, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_LASX
+ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31)
+#endif
+#undef ANY12S
+
+// Any 1 to 1 with source stride (2 rows of source). Outputs an interleaved
+// UV plane. A 128-byte temp row allows for 32 AVX ARGB pixels.
+#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t vin[128 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[128]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride, dst_vu, n); \
+ } \
+ memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+ memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \
+ BPP); \
+ memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \
+ vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(vin, 128, vout, MASK + 1); \
+ memcpy(dst_vu + (n >> 1) * 2, vout, SS(r, 1) * 2); \
+ }
+
+#ifdef HAS_AYUVTOVUROW_NEON
+ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
+ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
+#endif
+#undef ANY11S
+
+#define ANYDETILE(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(const T* src, ptrdiff_t src_tile_stride, T* dst, int width) { \
+ SIMD_ALIGNED(T vin[16]); \
+ SIMD_ALIGNED(T vout[16]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src, src_tile_stride, dst, n); \
+ } \
+ memcpy(vin, src + (n / 16) * src_tile_stride, r * BPP); \
+ ANY_SIMD(vin, src_tile_stride, vout, MASK + 1); \
+ memcpy(dst + n, vout, r * BPP); \
+ }
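+
+// Tiles are 16 pixels wide, so the source advances by one tile stride per 16
+// output pixels; the remainder therefore starts at (n / 16) * src_tile_stride.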
+
+#ifdef HAS_DETILEROW_NEON
+ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, uint8_t, 1, 15)
+#endif
+#ifdef HAS_DETILEROW_SSE2
+ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, uint8_t, 1, 15)
+#endif
+#ifdef HAS_DETILEROW_16_NEON
+ANYDETILE(DetileRow_16_Any_NEON, DetileRow_16_NEON, uint16_t, 2, 15)
+#endif
+#ifdef HAS_DETILEROW_16_SSE2
+ANYDETILE(DetileRow_16_Any_SSE2, DetileRow_16_SSE2, uint16_t, 2, 15)
+#endif
+#ifdef HAS_DETILEROW_16_AVX
+ANYDETILE(DetileRow_16_Any_AVX, DetileRow_16_AVX, uint16_t, 2, 15)
+#endif
+
+// DetileSplitUVRow: width is in bytes.
+#define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \
+ void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ SIMD_ALIGNED(uint8_t vin[16]); \
+ SIMD_ALIGNED(uint8_t vout[8 * 2]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_uv, src_tile_stride, dst_u, dst_v, n); \
+ } \
+ memcpy(vin, src_uv + (n / 16) * src_tile_stride, r); \
+ ANY_SIMD(vin, src_tile_stride, vout, vout + 8, r); \
+ memcpy(dst_u + n / 2, vout, (r + 1) / 2); \
+ memcpy(dst_v + n / 2, vout + 8, (r + 1) / 2); \
+ }
+
+#ifdef HAS_DETILESPLITUVROW_NEON
+ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15)
+#endif
+#ifdef HAS_DETILESPLITUVROW_SSSE3
+ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15)
+#endif
+
+#define ANYDETILEMERGE(NAMEANY, ANY_SIMD, MASK) \
+ void NAMEANY(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, \
+ const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride, \
+ uint8_t* dst_yuy2, int width) { \
+ SIMD_ALIGNED(uint8_t vin[16 * 2]); \
+ SIMD_ALIGNED(uint8_t vout[16 * 2]); \
+ memset(vin, 0, sizeof(vin)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2, \
+ n); \
+ } \
+ memcpy(vin, src_y + (n / 16) * src_y_tile_stride, r); \
+ memcpy(vin + 16, src_uv + (n / 16) * src_uv_tile_stride, r); \
+ ANY_SIMD(vin, src_y_tile_stride, vin + 16, src_uv_tile_stride, vout, r); \
+ memcpy(dst_yuy2 + 2 * n, vout, 2 * r); \
+ }
+
+#ifdef HAS_DETILETOYUY2_NEON
+ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15)
+#endif
+
+#ifdef HAS_DETILETOYUY2_SSE2
+ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15)
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_common.cc b/source/row_common.cc
index 8951d003..3afc4b4d 100644
--- a/files/source/row_common.cc
+++ b/source/row_common.cc
@@ -10,34 +10,72 @@
#include "libyuv/row.h"
-#include <stdio.h>
+#include <assert.h>
#include <string.h> // For memcpy and memset.
#include "libyuv/basic_types.h"
+#include "libyuv/convert_argb.h" // For kYuvI601Constants
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
+#ifdef __cplusplus
+#define STATIC_CAST(type, expr) static_cast<type>(expr)
+#else
+#define STATIC_CAST(type, expr) (type)(expr)
+#endif
+
+// This macro makes YUV-to-RGB conversion use unsigned math, extending the
+// range of the YUV-to-RGB coefficients from 0..2 to 0..4 for more accuracy
+// on B:
+// LIBYUV_UNLIMITED_DATA
+
+// Macros to enable unlimited data for each colorspace
+// LIBYUV_UNLIMITED_BT601
+// LIBYUV_UNLIMITED_BT709
+// LIBYUV_UNLIMITED_BT2020
+
+// The following macro from row_win makes the C code match the row_win code,
+// which is 7 bit fixed point for ARGBToI420:
+#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \
+ defined(_MSC_VER) && !defined(__clang__) && \
+ (defined(_M_IX86) || defined(_M_X64))
+#define LIBYUV_RGB7 1
+#endif
+
+#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86))
+#define LIBYUV_ARGBTOUV_PAVGB 1
+#define LIBYUV_RGBTOU_TRUNCATE 1
+#endif
+#if defined(LIBYUV_BIT_EXACT)
+#define LIBYUV_UNATTENUATE_DUP 1
+#endif
+
// LLVM on x86 generates poor code for the ternary operator, so use branchless min/max.
#define USE_BRANCHLESS 1
#if USE_BRANCHLESS
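+// These rely on -(cond) producing an all-ones mask when cond is true:
+// clamp0(-7) yields -(0) & -7 = 0 while clamp0(7) yields -1 & 7 = 7, and
+// Abs uses m = -(v < 0), so for v = -5, (v + m) ^ m = -6 ^ -1 = 5.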
static __inline int32_t clamp0(int32_t v) {
- return ((-(v) >> 31) & (v));
+ return -(v >= 0) & v;
}
-
+// TODO(fbarchard): make clamp255 preserve negative values.
static __inline int32_t clamp255(int32_t v) {
- return (((255 - (v)) >> 31) | (v)) & 255;
+ return (-(v >= 255) | v) & 255;
}
static __inline int32_t clamp1023(int32_t v) {
- return (((1023 - (v)) >> 31) | (v)) & 1023;
+ return (-(v >= 1023) | v) & 1023;
+}
+
+// clamp to max
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
+ return (-(v >= max) | v) & max;
}
static __inline uint32_t Abs(int32_t v) {
- int m = v >> 31;
+ int m = -(v < 0);
return (v + m) ^ m;
}
#else // USE_BRANCHLESS
@@ -53,6 +91,10 @@ static __inline int32_t clamp1023(int32_t v) {
return (v > 1023) ? 1023 : v;
}
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
+ return (v > max) ? max : v;
+}
+
static __inline uint32_t Abs(int32_t v) {
return (v < 0) ? -v : v;
}
@@ -111,6 +153,21 @@ void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
}
}
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t r = src_raw[0];
+ uint8_t g = src_raw[1];
+ uint8_t b = src_raw[2];
+ dst_rgba[0] = 255u;
+ dst_rgba[1] = b;
+ dst_rgba[2] = g;
+ dst_rgba[3] = r;
+ dst_rgba += 4;
+ src_raw += 3;
+ }
+}
+
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -130,12 +187,13 @@ void RGB565ToARGBRow_C(const uint8_t* src_rgb565,
int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8_t b = src_rgb565[0] & 0x1f;
- uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8_t r = src_rgb565[1] >> 3;
- dst_argb[0] = (b << 3) | (b >> 2);
- dst_argb[1] = (g << 2) | (g >> 4);
- dst_argb[2] = (r << 3) | (r >> 2);
+ uint8_t b = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f);
+ uint8_t g = STATIC_CAST(
+ uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3));
+ uint8_t r = STATIC_CAST(uint8_t, src_rgb565[1] >> 3);
+ dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2));
+ dst_argb[1] = STATIC_CAST(uint8_t, (g << 2) | (g >> 4));
+ dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2));
dst_argb[3] = 255u;
dst_argb += 4;
src_rgb565 += 2;
@@ -147,13 +205,14 @@ void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8_t b = src_argb1555[0] & 0x1f;
- uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
- uint8_t a = src_argb1555[1] >> 7;
- dst_argb[0] = (b << 3) | (b >> 2);
- dst_argb[1] = (g << 3) | (g >> 2);
- dst_argb[2] = (r << 3) | (r >> 2);
+ uint8_t b = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f);
+ uint8_t g = STATIC_CAST(
+ uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3));
+ uint8_t r = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2);
+ uint8_t a = STATIC_CAST(uint8_t, src_argb1555[1] >> 7);
+ dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2));
+ dst_argb[1] = STATIC_CAST(uint8_t, (g << 3) | (g >> 2));
+ dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2));
dst_argb[3] = -a;
dst_argb += 4;
src_argb1555 += 2;
@@ -165,14 +224,14 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8_t b = src_argb4444[0] & 0x0f;
- uint8_t g = src_argb4444[0] >> 4;
- uint8_t r = src_argb4444[1] & 0x0f;
- uint8_t a = src_argb4444[1] >> 4;
- dst_argb[0] = (b << 4) | b;
- dst_argb[1] = (g << 4) | g;
- dst_argb[2] = (r << 4) | r;
- dst_argb[3] = (a << 4) | a;
+ uint8_t b = STATIC_CAST(uint8_t, src_argb4444[0] & 0x0f);
+ uint8_t g = STATIC_CAST(uint8_t, src_argb4444[0] >> 4);
+ uint8_t r = STATIC_CAST(uint8_t, src_argb4444[1] & 0x0f);
+ uint8_t a = STATIC_CAST(uint8_t, src_argb4444[1] >> 4);
+ dst_argb[0] = STATIC_CAST(uint8_t, (b << 4) | b);
+ dst_argb[1] = STATIC_CAST(uint8_t, (g << 4) | g);
+ dst_argb[2] = STATIC_CAST(uint8_t, (r << 4) | r);
+ dst_argb[3] = STATIC_CAST(uint8_t, (a << 4) | a);
dst_argb += 4;
src_argb4444 += 2;
}
@@ -181,7 +240,8 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
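+ // A fixed-size memcpy expresses the unaligned load without the
+ // strict-aliasing violation of the old cast; compilers typically lower
+ // it to a single 32-bit load.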
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
@@ -195,7 +255,8 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
@@ -209,7 +270,8 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = ar30 & 0x3ff;
uint32_t ga = ar30 & 0xc00ffc00;
uint32_t r = (ar30 >> 20) & 0x3ff;
@@ -219,6 +281,54 @@ void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
}
}
+void ARGBToABGRRow_C(const uint8_t* src_argb, uint8_t* dst_abgr, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_argb[0];
+ uint8_t g = src_argb[1];
+ uint8_t r = src_argb[2];
+ uint8_t a = src_argb[3];
+ dst_abgr[0] = r;
+ dst_abgr[1] = g;
+ dst_abgr[2] = b;
+ dst_abgr[3] = a;
+ dst_abgr += 4;
+ src_argb += 4;
+ }
+}
+
+void ARGBToBGRARow_C(const uint8_t* src_argb, uint8_t* dst_bgra, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_argb[0];
+ uint8_t g = src_argb[1];
+ uint8_t r = src_argb[2];
+ uint8_t a = src_argb[3];
+ dst_bgra[0] = a;
+ dst_bgra[1] = r;
+ dst_bgra[2] = g;
+ dst_bgra[3] = b;
+ dst_bgra += 4;
+ src_argb += 4;
+ }
+}
+
+void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgba, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_argb[0];
+ uint8_t g = src_argb[1];
+ uint8_t r = src_argb[2];
+ uint8_t a = src_argb[3];
+ dst_rgba[0] = a;
+ dst_rgba[1] = b;
+ dst_rgba[2] = g;
+ dst_rgba[3] = r;
+ dst_rgba += 4;
+ src_argb += 4;
+ }
+}
+
void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -247,6 +357,22 @@ void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
}
}
+void RGBAToARGBRow_C(const uint8_t* src_rgba, uint8_t* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t a = src_rgba[0];
+ uint8_t b = src_rgba[1];
+ uint8_t g = src_rgba[2];
+ uint8_t r = src_rgba[3];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ dst_argb += 4;
+ src_rgba += 4;
+ }
+}
+
void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -265,7 +391,7 @@ void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
uint8_t b0 = src_argb[0] >> 3;
uint8_t g0 = src_argb[1] >> 2;
uint8_t r0 = src_argb[2] >> 3;
- *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ *(uint16_t*)(dst_rgb) = STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11));
}
}
@@ -279,29 +405,31 @@ void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
// or the upper byte for big endian.
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
int dither0 = ((const unsigned char*)(&dither4))[x & 3];
int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
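+ // Each byte of dither4 holds the dither offset for one of 4 pixel
+ // columns, selected by x & 3.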
- uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
- uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
- uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
- uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3;
- uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2;
- uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3;
- WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
- (r1 << 27));
+ uint8_t b0 = STATIC_CAST(uint8_t, clamp255(src_argb[0] + dither0) >> 3);
+ uint8_t g0 = STATIC_CAST(uint8_t, clamp255(src_argb[1] + dither0) >> 2);
+ uint8_t r0 = STATIC_CAST(uint8_t, clamp255(src_argb[2] + dither0) >> 3);
+ uint8_t b1 = STATIC_CAST(uint8_t, clamp255(src_argb[4] + dither1) >> 3);
+ uint8_t g1 = STATIC_CAST(uint8_t, clamp255(src_argb[5] + dither1) >> 2);
+ uint8_t r1 = STATIC_CAST(uint8_t, clamp255(src_argb[6] + dither1) >> 3);
+ *(uint16_t*)(dst_rgb + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11));
+ *(uint16_t*)(dst_rgb + 2) =
+ STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 11));
dst_rgb += 4;
src_argb += 8;
}
if (width & 1) {
int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
- uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
- uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
- uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
- *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ uint8_t b0 = STATIC_CAST(uint8_t, clamp255(src_argb[0] + dither0) >> 3);
+ uint8_t g0 = STATIC_CAST(uint8_t, clamp255(src_argb[1] + dither0) >> 2);
+ uint8_t r0 = STATIC_CAST(uint8_t, clamp255(src_argb[2] + dither0) >> 3);
+ *(uint16_t*)(dst_rgb) = STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11));
}
}
@@ -316,8 +444,10 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
uint8_t g1 = src_argb[5] >> 3;
uint8_t r1 = src_argb[6] >> 3;
uint8_t a1 = src_argb[7] >> 7;
- *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
- (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+ *(uint16_t*)(dst_rgb + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15));
+ *(uint16_t*)(dst_rgb + 2) =
+ STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 10) | (a1 << 15));
dst_rgb += 4;
src_argb += 8;
}
@@ -326,7 +456,8 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
uint8_t g0 = src_argb[1] >> 3;
uint8_t r0 = src_argb[2] >> 3;
uint8_t a0 = src_argb[3] >> 7;
- *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+ *(uint16_t*)(dst_rgb) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15));
}
}
@@ -341,8 +472,10 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
uint8_t g1 = src_argb[5] >> 4;
uint8_t r1 = src_argb[6] >> 4;
uint8_t a1 = src_argb[7] >> 4;
- *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
- (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+ *(uint16_t*)(dst_rgb + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12));
+ *(uint16_t*)(dst_rgb + 2) =
+ STATIC_CAST(uint16_t, b1 | (g1 << 4) | (r1 << 8) | (a1 << 12));
dst_rgb += 4;
src_argb += 8;
}
@@ -351,18 +484,20 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
uint8_t g0 = src_argb[1] >> 4;
uint8_t r0 = src_argb[2] >> 4;
uint8_t a0 = src_argb[3] >> 4;
- *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+ *(uint16_t*)(dst_rgb) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12));
}
}
void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2);
+ uint32_t r0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2);
uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2);
- uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2);
+ uint32_t b0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2);
uint32_t a0 = (src_abgr[3] >> 6);
- *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30);
+ *(uint32_t*)(dst_ar30) =
+ STATIC_CAST(uint32_t, b0 | (g0 << 10) | (r0 << 20) | (a0 << 30));
dst_ar30 += 4;
src_abgr += 4;
}
@@ -375,62 +510,249 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2);
uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2);
uint32_t a0 = (src_argb[3] >> 6);
- *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30);
+ *(uint32_t*)(dst_ar30) =
+ STATIC_CAST(uint32_t, b0 | (g0 << 10) | (r0 << 20) | (a0 << 30));
dst_ar30 += 4;
src_argb += 4;
}
}
-static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
- return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
+void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
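+ // Multiplying by 0x0101 replicates the 8-bit value into both bytes,
+ // mapping 0..255 evenly onto 0..65535 (v * 257).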
+ uint16_t b = src_argb[0] * 0x0101;
+ uint16_t g = src_argb[1] * 0x0101;
+ uint16_t r = src_argb[2] * 0x0101;
+ uint16_t a = src_argb[3] * 0x0101;
+ dst_ar64[0] = b;
+ dst_ar64[1] = g;
+ dst_ar64[2] = r;
+ dst_ar64[3] = a;
+ dst_ar64 += 4;
+ src_argb += 4;
+ }
+}
+
+void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint16_t b = src_argb[0] * 0x0101;
+ uint16_t g = src_argb[1] * 0x0101;
+ uint16_t r = src_argb[2] * 0x0101;
+ uint16_t a = src_argb[3] * 0x0101;
+ dst_ab64[0] = r;
+ dst_ab64[1] = g;
+ dst_ab64[2] = b;
+ dst_ab64[3] = a;
+ dst_ab64 += 4;
+ src_argb += 4;
+ }
+}
+
+void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_ar64[0] >> 8;
+ uint8_t g = src_ar64[1] >> 8;
+ uint8_t r = src_ar64[2] >> 8;
+ uint8_t a = src_ar64[3] >> 8;
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ dst_argb += 4;
+ src_ar64 += 4;
+ }
+}
+
+void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t r = src_ab64[0] >> 8;
+ uint8_t g = src_ab64[1] >> 8;
+ uint8_t b = src_ab64[2] >> 8;
+ uint8_t a = src_ab64[3] >> 8;
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ dst_argb += 4;
+ src_ab64 += 4;
+ }
+}
+
+void AR64ToAB64Row_C(const uint16_t* src_ar64, uint16_t* dst_ab64, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint16_t b = src_ar64[0];
+ uint16_t g = src_ar64[1];
+ uint16_t r = src_ar64[2];
+ uint16_t a = src_ar64[3];
+ dst_ab64[0] = r;
+ dst_ab64[1] = g;
+ dst_ab64[2] = b;
+ dst_ab64[3] = a;
+ dst_ab64 += 4;
+ src_ar64 += 4;
+ }
+}
+
+// TODO(fbarchard): Make shuffle compatible with SIMD versions
+void AR64ShuffleRow_C(const uint8_t* src_ar64,
+ uint8_t* dst_ar64,
+ const uint8_t* shuffler,
+ int width) {
+ const uint16_t* src_ar64_16 = (const uint16_t*)src_ar64;
+ uint16_t* dst_ar64_16 = (uint16_t*)dst_ar64;
+ int index0 = shuffler[0] / 2;
+ int index1 = shuffler[2] / 2;
+ int index2 = shuffler[4] / 2;
+ int index3 = shuffler[6] / 2;
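+ // The shuffler holds byte indices; dividing by 2 converts them to
+ // 16-bit channel indices.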
+ // Shuffle a row of AR64.
+ int x;
+ for (x = 0; x < width / 2; ++x) {
+ // Read all components before writing to support in-place conversion.
+ uint16_t b = src_ar64_16[index0];
+ uint16_t g = src_ar64_16[index1];
+ uint16_t r = src_ar64_16[index2];
+ uint16_t a = src_ar64_16[index3];
+ dst_ar64_16[0] = b;
+ dst_ar64_16[1] = g;
+ dst_ar64_16[2] = r;
+ dst_ar64_16[3] = a;
+ src_ar64_16 += 4;
+ dst_ar64_16 += 4;
+ }
+}
+
+#ifdef LIBYUV_RGB7
+// Old 7 bit math for compatibility on unsupported platforms.
+static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+ return STATIC_CAST(uint8_t, ((33 * r + 65 * g + 13 * b) >> 7) + 16);
+}
+#else
+// 8 bit
+// Intel SSE/AVX uses the following equivalent formula
+// 0x7e80 = (66 + 129 + 25) * 128 + 0x1000 (for +16) + 0x0080 (for rounding).
+// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) +
+// 0x7e80) >> 8;
+
+static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+ return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8);
+}
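+// Example: white (255,255,255) -> ((66 + 129 + 25) * 255 + 0x1080) >> 8 = 235
+// and black (0,0,0) -> 0x1080 >> 8 = 16, the BT.601 limited-range extremes.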
+#endif
+
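+// AVGB computes the rounded average of two bytes, matching x86 pavgb.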
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+
+// LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round.
+#ifdef LIBYUV_RGBTOU_TRUNCATE
+static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
+ return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8);
+}
+static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
+ return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8);
}
+#else
+// TODO(fbarchard): Add rounding to x86 SIMD and use this
+static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
+ return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8);
+}
+static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
+ return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8);
+}
+#endif
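+// In both variants 0x8000 supplies the +128 output bias (128 << 8); the
+// extra 0x80 in the rounding variant adds 0.5 before the >> 8.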
-static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
- return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
+// LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb.
+#if !defined(LIBYUV_ARGBTOUV_PAVGB)
+static __inline uint8_t RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
+ return STATIC_CAST(
+ uint8_t, ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8);
}
-static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
- return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
+static __inline uint8_t RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
+ return STATIC_CAST(
+ uint8_t, ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8);
}
+#endif
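+// The RGB2x helpers take 2x-scaled sums of two pixels (0..510), so they use
+// halved coefficients to stay on the same scale as RGBToU/RGBToV.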
// ARGBToY_C and ARGBToUV_C
-#define MAKEROWY(NAME, R, G, B, BPP) \
- void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += BPP; \
- dst_y += 1; \
- } \
- } \
- void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
- uint8_t* dst_u, uint8_t* dst_v, int width) { \
- const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
- int x; \
- for (x = 0; x < width - 1; x += 2) { \
- uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
- src_rgb1[B + BPP]) >> \
- 2; \
- uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
- src_rgb1[G + BPP]) >> \
- 2; \
- uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
- src_rgb1[R + BPP]) >> \
- 2; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- src_rgb0 += BPP * 2; \
- src_rgb1 += BPP * 2; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
- uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
- uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- } \
+// Intel version mimics SSE/AVX, which subsamples with 2 pavgb instructions.
+#if LIBYUV_ARGBTOUV_PAVGB
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \
+ src_rgb += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
+ AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
+ uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
+ AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
+ uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
+ AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
+ uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
+ uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
+ }
+#else
+// ARM version computes sum / 2, then multiplies by 2x smaller coefficients.
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \
+ src_rgb += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ src_rgb += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = src_rgb[B] + src_rgb1[B]; \
+ uint16_t ag = src_rgb[G] + src_rgb1[G]; \
+ uint16_t ar = src_rgb[R] + src_rgb1[R]; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ } \
}
+#endif
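+// Each invocation below instantiates NAME##ToYRow_C and NAME##ToUVRow_C with
+// the byte offsets of R, G and B within a pixel and BPP bytes per pixel;
+// e.g. ARGB is stored B,G,R,A, so R=2, G=1, B=0, BPP=4.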
MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
@@ -448,14 +770,14 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// b 0.1016 * 255 = 25.908 = 25
// g 0.5078 * 255 = 129.489 = 129
// r 0.2578 * 255 = 65.739 = 66
-// JPeg 8 bit Y (not used):
-// b 0.11400 * 256 = 29.184 = 29
-// g 0.58700 * 256 = 150.272 = 150
-// r 0.29900 * 256 = 76.544 = 77
-// JPeg 7 bit Y:
+// JPeg 7 bit Y (deprecated):
// b 0.11400 * 128 = 14.592 = 15
// g 0.58700 * 128 = 75.136 = 75
// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit Y:
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
// JPeg 8 bit U:
// b 0.50000 * 255 = 127.5 = 127
// g -0.33126 * 255 = -84.4713 = -84
@@ -465,68 +787,132 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// g -0.41869 * 255 = -106.76595 = -107
// r 0.50000 * 255 = 127.5 = 127
-static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+#ifdef LIBYUV_RGB7
+// Old 7 bit math for compatibility on unsupported platforms.
+static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
return (38 * r + 75 * g + 15 * b + 64) >> 7;
}
+#else
+// 8 bit
+static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+ return (77 * r + 150 * g + 29 * b + 128) >> 8;
+}
+#endif
-static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
+#if defined(LIBYUV_ARGBTOUV_PAVGB)
+static __inline uint8_t RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
}
-static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
+static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
}
-
-#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+#else
+static __inline uint8_t RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) {
+ return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8;
+}
+static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
+ return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8;
+}
+#endif
// ARGBToYJ_C and ARGBToUVJ_C
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
- void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += BPP; \
- dst_y += 1; \
- } \
- } \
- void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
- uint8_t* dst_u, uint8_t* dst_v, int width) { \
- const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
- int x; \
- for (x = 0; x < width - 1; x += 2) { \
- uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
- AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
- uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
- AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
- uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
- AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
- dst_u[0] = RGBToUJ(ar, ag, ab); \
- dst_v[0] = RGBToVJ(ar, ag, ab); \
- src_rgb0 += BPP * 2; \
- src_rgb1 += BPP * 2; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
- uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
- uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
- dst_u[0] = RGBToUJ(ar, ag, ab); \
- dst_v[0] = RGBToVJ(ar, ag, ab); \
- } \
+// Intel version mimics SSE/AVX, which subsamples with 2 pavgb instructions.
+#if LIBYUV_ARGBTOUV_PAVGB
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \
+ src_rgb += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
+ AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
+ uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
+ AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
+ uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
+ AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ src_rgb += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
+ uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
+ uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ } \
}
+#else
+// ARM version computes sum / 2, then multiplies by 2x smaller coefficients.
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \
+ src_rgb += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ src_rgb += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = (src_rgb[B] + src_rgb1[B]); \
+ uint16_t ag = (src_rgb[G] + src_rgb1[G]); \
+ uint16_t ar = (src_rgb[R] + src_rgb1[R]); \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ } \
+ }
+
+#endif
MAKEROWYJ(ARGB, 2, 1, 0, 4)
+MAKEROWYJ(ABGR, 0, 1, 2, 4)
+MAKEROWYJ(RGBA, 3, 2, 1, 4)
+MAKEROWYJ(RGB24, 2, 1, 0, 3)
+MAKEROWYJ(RAW, 0, 1, 2, 3)
#undef MAKEROWYJ
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
int x;
for (x = 0; x < width; ++x) {
uint8_t b = src_rgb565[0] & 0x1f;
- uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8_t g = STATIC_CAST(
+ uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3));
uint8_t r = src_rgb565[1] >> 3;
- b = (b << 3) | (b >> 2);
- g = (g << 2) | (g >> 4);
- r = (r << 3) | (r >> 2);
+ b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2));
+ g = STATIC_CAST(uint8_t, (g << 2) | (g >> 4));
+ r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2));
dst_y[0] = RGBToY(r, g, b);
src_rgb565 += 2;
dst_y += 1;
@@ -537,11 +923,12 @@ void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) {
int x;
for (x = 0; x < width; ++x) {
uint8_t b = src_argb1555[0] & 0x1f;
- uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8_t g = STATIC_CAST(
+ uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3));
uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
- b = (b << 3) | (b >> 2);
- g = (g << 3) | (g >> 2);
- r = (r << 3) | (r >> 2);
+ b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2));
+ g = STATIC_CAST(uint8_t, (g << 3) | (g >> 2));
+ r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2));
dst_y[0] = RGBToY(r, g, b);
src_argb1555 += 2;
dst_y += 1;
@@ -554,9 +941,9 @@ void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) {
uint8_t b = src_argb4444[0] & 0x0f;
uint8_t g = src_argb4444[0] >> 4;
uint8_t r = src_argb4444[1] & 0x0f;
- b = (b << 4) | b;
- g = (g << 4) | g;
- r = (r << 4) | r;
+ b = STATIC_CAST(uint8_t, (b << 4) | b);
+ g = STATIC_CAST(uint8_t, (g << 4) | g);
+ r = STATIC_CAST(uint8_t, (r << 4) | r);
dst_y[0] = RGBToY(r, g, b);
src_argb4444 += 2;
dst_y += 1;
@@ -571,45 +958,84 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
int x;
for (x = 0; x < width - 1; x += 2) {
- uint8_t b0 = src_rgb565[0] & 0x1f;
- uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8_t r0 = src_rgb565[1] >> 3;
- uint8_t b1 = src_rgb565[2] & 0x1f;
- uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
- uint8_t r1 = src_rgb565[3] >> 3;
- uint8_t b2 = next_rgb565[0] & 0x1f;
- uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
- uint8_t r2 = next_rgb565[1] >> 3;
- uint8_t b3 = next_rgb565[2] & 0x1f;
- uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
- uint8_t r3 = next_rgb565[3] >> 3;
- uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 787 -> 888.
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+ uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f);
+ uint8_t g0 = STATIC_CAST(
+ uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3));
+ uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3);
+ uint8_t b1 = STATIC_CAST(uint8_t, src_rgb565[2] & 0x1f);
+ uint8_t g1 = STATIC_CAST(
+ uint8_t, (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3));
+ uint8_t r1 = STATIC_CAST(uint8_t, src_rgb565[3] >> 3);
+ uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f);
+ uint8_t g2 = STATIC_CAST(
+ uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3));
+ uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3);
+ uint8_t b3 = STATIC_CAST(uint8_t, next_rgb565[2] & 0x1f);
+ uint8_t g3 = STATIC_CAST(
+ uint8_t, (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3));
+ uint8_t r3 = STATIC_CAST(uint8_t, next_rgb565[3] >> 3);
+
+ b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2));
+ g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4));
+ r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2));
+ b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2));
+ g1 = STATIC_CAST(uint8_t, (g1 << 2) | (g1 >> 4));
+ r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2));
+ b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2));
+ g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4));
+ r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2));
+ b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2));
+ g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4));
+ r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2));
+
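+ // Expand 5/6-bit channels to 8 bits before averaging so both subsample
+ // paths below see full-range values.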
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_rgb565 += 4;
next_rgb565 += 4;
dst_u += 1;
dst_v += 1;
}
if (width & 1) {
- uint8_t b0 = src_rgb565[0] & 0x1f;
- uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8_t r0 = src_rgb565[1] >> 3;
- uint8_t b2 = next_rgb565[0] & 0x1f;
- uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
- uint8_t r2 = next_rgb565[1] >> 3;
- uint8_t b = (b0 + b2); // 565 * 2 = 676.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 676 -> 888
- g = (g << 1) | (g >> 6);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+ uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f);
+ uint8_t g0 = STATIC_CAST(
+ uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3));
+ uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3);
+ uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f);
+ uint8_t g2 = STATIC_CAST(
+ uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3));
+ uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3);
+ b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2));
+ g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4));
+ r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2));
+ b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2));
+ g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4));
+ r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2));
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -621,46 +1047,85 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
int x;
for (x = 0; x < width - 1; x += 2) {
- uint8_t b0 = src_argb1555[0] & 0x1f;
- uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
- uint8_t b1 = src_argb1555[2] & 0x1f;
- uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
- uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2;
- uint8_t b2 = next_argb1555[0] & 0x1f;
- uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
- uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2;
- uint8_t b3 = next_argb1555[2] & 0x1f;
- uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
- uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2;
- uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 777 -> 888.
- g = (g << 1) | (g >> 6);
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+ uint8_t b0 = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f);
+ uint8_t g0 = STATIC_CAST(
+ uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3));
+ uint8_t r0 = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2);
+ uint8_t b1 = STATIC_CAST(uint8_t, src_argb1555[2] & 0x1f);
+ uint8_t g1 = STATIC_CAST(
+ uint8_t, (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3));
+ uint8_t r1 = STATIC_CAST(uint8_t, (src_argb1555[3] & 0x7c) >> 2);
+ uint8_t b2 = STATIC_CAST(uint8_t, next_argb1555[0] & 0x1f);
+ uint8_t g2 = STATIC_CAST(
+ uint8_t, (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3));
+ uint8_t r2 = STATIC_CAST(uint8_t, (next_argb1555[1] & 0x7c) >> 2);
+ uint8_t b3 = STATIC_CAST(uint8_t, next_argb1555[2] & 0x1f);
+ uint8_t g3 = STATIC_CAST(
+ uint8_t, (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3));
+ uint8_t r3 = STATIC_CAST(uint8_t, (next_argb1555[3] & 0x7c) >> 2);
+
+ b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2));
+ g0 = STATIC_CAST(uint8_t, (g0 << 3) | (g0 >> 2));
+ r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2));
+ b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2));
+ g1 = STATIC_CAST(uint8_t, (g1 << 3) | (g1 >> 2));
+ r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2));
+ b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2));
+ g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2));
+ r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2));
+ b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2));
+ g3 = STATIC_CAST(uint8_t, (g3 << 3) | (g3 >> 2));
+ r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2));
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_argb1555 += 4;
next_argb1555 += 4;
dst_u += 1;
dst_v += 1;
}
if (width & 1) {
- uint8_t b0 = src_argb1555[0] & 0x1f;
- uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
- uint8_t b2 = next_argb1555[0] & 0x1f;
- uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
- uint8_t r2 = next_argb1555[1] >> 3;
- uint8_t b = (b0 + b2); // 555 * 2 = 666.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+ uint8_t b0 = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f);
+ uint8_t g0 = STATIC_CAST(
+ uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3));
+ uint8_t r0 = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2);
+ uint8_t b2 = STATIC_CAST(uint8_t, next_argb1555[0] & 0x1f);
+ uint8_t g2 = STATIC_CAST(
+ uint8_t, (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3));
+ uint8_t r2 = STATIC_CAST(uint8_t, (next_argb1555[1] & 0x7c) >> 2);
+
+ b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2));
+ g0 = STATIC_CAST(uint8_t, (g0 << 3) | (g0 >> 2));
+ r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2));
+ b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2));
+ g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2));
+ r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2));
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -684,14 +1149,34 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
uint8_t b3 = next_argb4444[2] & 0x0f;
uint8_t g3 = next_argb4444[2] >> 4;
uint8_t r3 = next_argb4444[3] & 0x0f;
- uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = STATIC_CAST(uint8_t, (b0 << 4) | b0);
+ g0 = STATIC_CAST(uint8_t, (g0 << 4) | g0);
+ r0 = STATIC_CAST(uint8_t, (r0 << 4) | r0);
+ b1 = STATIC_CAST(uint8_t, (b1 << 4) | b1);
+ g1 = STATIC_CAST(uint8_t, (g1 << 4) | g1);
+ r1 = STATIC_CAST(uint8_t, (r1 << 4) | r1);
+ b2 = STATIC_CAST(uint8_t, (b2 << 4) | b2);
+ g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2);
+ r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2);
+ b3 = STATIC_CAST(uint8_t, (b3 << 4) | b3);
+ g3 = STATIC_CAST(uint8_t, (g3 << 4) | g3);
+ r3 = STATIC_CAST(uint8_t, (r3 << 4) | r3);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_argb4444 += 4;
next_argb4444 += 4;
dst_u += 1;
@@ -704,14 +1189,27 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
uint8_t b2 = next_argb4444[0] & 0x0f;
uint8_t g2 = next_argb4444[0] >> 4;
uint8_t r2 = next_argb4444[1] & 0x0f;
- uint8_t b = (b0 + b2); // 444 * 2 = 555.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 3) | (b >> 2); // 555 -> 888.
- g = (g << 3) | (g >> 2);
- r = (r << 3) | (r >> 2);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = STATIC_CAST(uint8_t, (b0 << 4) | b0);
+ g0 = STATIC_CAST(uint8_t, (g0 << 4) | g0);
+ r0 = STATIC_CAST(uint8_t, (r0 << 4) | r0);
+ b2 = STATIC_CAST(uint8_t, (b2 << 4) | b2);
+ g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2);
+ r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -754,9 +1252,9 @@ void ARGBSepiaRow_C(uint8_t* dst_argb, int width) {
int sg = (b * 22 + g * 88 + r * 45) >> 7;
int sr = (b * 24 + g * 98 + r * 50) >> 7;
// b does not overflow. a is preserved from the original.
- dst_argb[0] = sb;
- dst_argb[1] = clamp255(sg);
- dst_argb[2] = clamp255(sr);
+ dst_argb[0] = STATIC_CAST(uint8_t, sb);
+ dst_argb[1] = STATIC_CAST(uint8_t, clamp255(sg));
+ dst_argb[2] = STATIC_CAST(uint8_t, clamp255(sr));
dst_argb += 4;
}
}
@@ -785,10 +1283,10 @@ void ARGBColorMatrixRow_C(const uint8_t* src_argb,
int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] +
a * matrix_argb[15]) >>
6;
- dst_argb[0] = Clamp(sb);
- dst_argb[1] = Clamp(sg);
- dst_argb[2] = Clamp(sr);
- dst_argb[3] = Clamp(sa);
+ dst_argb[0] = STATIC_CAST(uint8_t, Clamp(sb));
+ dst_argb[1] = STATIC_CAST(uint8_t, Clamp(sg));
+ dst_argb[2] = STATIC_CAST(uint8_t, Clamp(sr));
+ dst_argb[3] = STATIC_CAST(uint8_t, Clamp(sa));
src_argb += 4;
dst_argb += 4;
}
@@ -838,9 +1336,12 @@ void ARGBQuantizeRow_C(uint8_t* dst_argb,
int b = dst_argb[0];
int g = dst_argb[1];
int r = dst_argb[2];
- dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
- dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
- dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
+ dst_argb[0] = STATIC_CAST(
+ uint8_t, (b * scale >> 16) * interval_size + interval_offset);
+ dst_argb[1] = STATIC_CAST(
+ uint8_t, (g * scale >> 16) * interval_size + interval_offset);
+ dst_argb[2] = STATIC_CAST(
+ uint8_t, (r * scale >> 16) * interval_size + interval_offset);
dst_argb += 4;
}
}
@@ -877,25 +1378,25 @@ void ARGBShadeRow_C(const uint8_t* src_argb,
#define REPEAT8(v) (v) | ((v) << 8)
#define SHADE(f, v) v* f >> 16
-void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+void ARGBMultiplyRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
- const uint32_t b = REPEAT8(src_argb0[0]);
- const uint32_t g = REPEAT8(src_argb0[1]);
- const uint32_t r = REPEAT8(src_argb0[2]);
- const uint32_t a = REPEAT8(src_argb0[3]);
+ const uint32_t b = REPEAT8(src_argb[0]);
+ const uint32_t g = REPEAT8(src_argb[1]);
+ const uint32_t r = REPEAT8(src_argb[2]);
+ const uint32_t a = REPEAT8(src_argb[3]);
const uint32_t b_scale = src_argb1[0];
const uint32_t g_scale = src_argb1[1];
const uint32_t r_scale = src_argb1[2];
const uint32_t a_scale = src_argb1[3];
- dst_argb[0] = SHADE(b, b_scale);
- dst_argb[1] = SHADE(g, g_scale);
- dst_argb[2] = SHADE(r, r_scale);
- dst_argb[3] = SHADE(a, a_scale);
- src_argb0 += 4;
+ dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_scale));
+ dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_scale));
+ dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_scale));
+ dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_scale));
+ src_argb += 4;
src_argb1 += 4;
dst_argb += 4;
}
@@ -905,25 +1406,25 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0,
#define SHADE(f, v) clamp255(v + f)
-void ARGBAddRow_C(const uint8_t* src_argb0,
+void ARGBAddRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
- const int b = src_argb0[0];
- const int g = src_argb0[1];
- const int r = src_argb0[2];
- const int a = src_argb0[3];
+ const int b = src_argb[0];
+ const int g = src_argb[1];
+ const int r = src_argb[2];
+ const int a = src_argb[3];
const int b_add = src_argb1[0];
const int g_add = src_argb1[1];
const int r_add = src_argb1[2];
const int a_add = src_argb1[3];
- dst_argb[0] = SHADE(b, b_add);
- dst_argb[1] = SHADE(g, g_add);
- dst_argb[2] = SHADE(r, r_add);
- dst_argb[3] = SHADE(a, a_add);
- src_argb0 += 4;
+ dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_add));
+ dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_add));
+ dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_add));
+ dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_add));
+ src_argb += 4;
src_argb1 += 4;
dst_argb += 4;
}
@@ -932,25 +1433,25 @@ void ARGBAddRow_C(const uint8_t* src_argb0,
#define SHADE(f, v) clamp0(f - v)
-void ARGBSubtractRow_C(const uint8_t* src_argb0,
+void ARGBSubtractRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
- const int b = src_argb0[0];
- const int g = src_argb0[1];
- const int r = src_argb0[2];
- const int a = src_argb0[3];
+ const int b = src_argb[0];
+ const int g = src_argb[1];
+ const int r = src_argb[2];
+ const int a = src_argb[3];
const int b_sub = src_argb1[0];
const int g_sub = src_argb1[1];
const int r_sub = src_argb1[2];
const int a_sub = src_argb1[3];
- dst_argb[0] = SHADE(b, b_sub);
- dst_argb[1] = SHADE(g, g_sub);
- dst_argb[2] = SHADE(r, r_sub);
- dst_argb[3] = SHADE(a, a_sub);
- src_argb0 += 4;
+ dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_sub));
+ dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_sub));
+ dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_sub));
+ dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_sub));
+ src_argb += 4;
src_argb1 += 4;
dst_argb += 4;
}
@@ -1058,257 +1559,244 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
}
}
-// TODO(fbarchard): Unify these structures to be platform independent.
-// TODO(fbarchard): Generate SIMD structures from float matrix.
+// Macros to create SIMD-specific YUV-to-RGB conversion constants.
-// BT.601 YUV to RGB reference
-// R = (Y - 16) * 1.164 - V * -1.596
-// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
-// B = (Y - 16) * 1.164 - U * -2.018
+// clang-format off
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+// Bias values fold in the subtraction of 128 from U and V, the Y bias, and
+// rounding. For B and R the bias is negative; for G it is positive.
+#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
+ {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \
+ {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \
+ 0, 0}}
+#else
+#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
+ {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
+ {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
+#endif
+
+// clang-format on
+
+#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \
+ const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
+ YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \
+ const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
+ YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
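+
+// The kYvu constants swap the U and V coefficients so VU-ordered formats
+// (e.g. NV21) can reuse the same conversion kernels.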
+
+// TODO(fbarchard): Generate SIMD structures from float matrix.
+
+// BT.601 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164 + V * 1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 + U * 2.018
+// KR = 0.299; KB = 0.114
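+// Coefficients below are stored as round(c * 64) (6-bit fixed point); the
+// C kernels shift results right by 6 to return to 8 bits.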
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
-#define VR -102 /* round(-1.596 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
-
-#if defined(__aarch64__) // 64 bit arm
-const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-#elif defined(__arm__) // 32 bit arm
-const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
- {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
- {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601)
+#define UB 129 /* round(2.018 * 64) */
#else
-const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
- {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
- {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
-const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
- {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
- {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+#define UB 128 /* min(128, round(2.018 * 64)) */
#endif
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR 102 /* round(1.596 * 64) */
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
-// JPEG YUV to RGB reference
-// * R = Y - V * -1.40200
-// * G = Y - U * 0.34414 - V * 0.71414
-// * B = Y - U * -1.77200
+// BT.601 full range YUV to RGB reference (aka JPEG)
+// * R = Y + V * 1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y + U * 1.77200
+// KR = 0.299; KB = 0.114
+
+// U and V contributions to R,G,B.
+#define UB 113 /* round(1.77200 * 64) */
+#define UG 22 /* round(0.34414 * 64) */
+#define VG 46 /* round(0.71414 * 64) */
+#define VR 90 /* round(1.40200 * 64) */
// Y contribution to R,G,B. Scale and bias.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32 /* 64 / 2 */
+#define YB 32 /* 64 / 2 */
+
+MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+
+// BT.709 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164 + V * 1.793
+// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
+// B = (Y - 16) * 1.164 + U * 2.112
+// KR = 0.2126, KB = 0.0722
// U and V contributions to R,G,B.
-#define UB -113 /* round(-1.77200 * 64) */
-#define UG 22 /* round(0.34414 * 64) */
-#define VG 46 /* round(0.71414 * 64) */
-#define VR -90 /* round(-1.40200 * 64) */
-
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
-
-#if defined(__aarch64__)
-const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-#elif defined(__arm__)
-const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
- {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
- {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709)
+#define UB 135 /* round(2.112 * 64) */
#else
-const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
- {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
- {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
-const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
- {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
- {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+#define UB 128 /* min(128, round(2.112 * 64)) */
#endif
+#define UG 14 /* round(0.213 * 64) */
+#define VG 34 /* round(0.533 * 64) */
+#define VR 115 /* round(1.793 * 64) */
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
-// BT.709 YUV to RGB reference
-// R = (Y - 16) * 1.164 - V * -1.793
-// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
-// B = (Y - 16) * 1.164 - U * -2.112
-// See also http://www.equasys.de/colorconversion.html
+// BT.709 full range YUV to RGB reference
+// R = Y + V * 1.5748
+// G = Y - U * 0.18732 - V * 0.46812
+// B = Y + U * 1.8556
+// KR = 0.2126, KB = 0.0722
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+// U and V contributions to R,G,B.
+#define UB 119 /* round(1.8556 * 64) */
+#define UG 12 /* round(0.18732 * 64) */
+#define VG 30 /* round(0.46812 * 64) */
+#define VR 101 /* round(1.5748 * 64) */
+
+// Y contribution to R,G,B. Scale and bias. (same as jpeg)
+#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
+#define YB 32 /* 64 / 2 */
+
+MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+
+// BT.2020 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164384 + V * 1.67867
+// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
+// B = (Y - 16) * 1.164384 + U * 2.14177
+// KR = 0.2627; KB = 0.0593
-// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.112 * 64)) */
-#define UG 14 /* round(0.213 * 64) */
-#define VG 34 /* round(0.533 * 64) */
-#define VR -115 /* round(-1.793 * 64) */
-
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
-
-#if defined(__aarch64__)
-const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-#elif defined(__arm__)
-const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
- {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
- {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020)
+#define UB 137 /* round(2.14177 * 64) */
#else
-const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
- {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
- {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
-const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
- {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
- {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+#define UB 128 /* min(128, round(2.14177 * 64)) */
#endif
+#define UG 12 /* round(0.187326 * 64) */
+#define VG 42 /* round(0.65042 * 64) */
+#define VR 107 /* round(1.67867 * 64) */
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+// Y contribution to R,G,B. Scale and bias.
+#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
+
+MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
+
+// BT.2020 full range YUV to RGB reference
+// R = Y + V * 1.474600
+// G = Y - U * 0.164553 - V * 0.571353
+// B = Y + U * 1.881400
+// KR = 0.2627; KB = 0.0593
+
+#define UB 120 /* round(1.881400 * 64) */
+#define UG 11 /* round(0.164553 * 64) */
+#define VG 37 /* round(0.571353 * 64) */
+#define VR 94 /* round(1.474600 * 64) */
+
+// Y contribution to R,G,B. Scale and bias. (same as jpeg)
+#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
+#define YB 32 /* 64 / 2 */
+
+MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)
+
#undef YG
+#undef YB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+
+#undef BB
+#undef BG
+#undef BR
+
+#undef MAKEYUVCONSTANTS
+
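+// Illustrative sketch (not part of this patch): a standalone check that the
+// 6 bit fixed point constants above track the float reference equations.
+// It uses the full range BT.2020 (V2020) numbers; the sample values are
+// arbitrary.
+//
+//   #include <stdint.h>
+//   #include <stdio.h>
+//   int main(void) {
+//     int y = 150, v = 200;              // arbitrary 8 bit Y and V sample
+//     int yg = 16320, yb = 32, vr = 94;  // V2020 constants from above
+//     int y1 = (int)(((uint32_t)(y * 0x0101) * yg) >> 16) + yb;
+//     int r16 = y1 + (v - 128) * vr;     // R in 8.6 fixed point, pre clamp
+//     // Prints "fixed 256 float 256.17"; the real code clamps to 255.
+//     printf("fixed %d float %.2f\n", r16 >> 6, y + (v - 128) * 1.4746);
+//     return 0;
+//   }
+//
+// The two macro variants below mirror how the constants are laid out for the
+// Arm/RISC-V kernels (kUVCoeff / kRGBCoeffBias) versus the x86 kernels
+// (kUVTo* / kYToRgb / kYBiasToRgb).
+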
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+#define LOAD_YUV_CONSTANTS \
+ int ub = yuvconstants->kUVCoeff[0]; \
+ int vr = yuvconstants->kUVCoeff[1]; \
+ int ug = yuvconstants->kUVCoeff[2]; \
+ int vg = yuvconstants->kUVCoeff[3]; \
+ int yg = yuvconstants->kRGBCoeffBias[0]; \
+ int bb = yuvconstants->kRGBCoeffBias[1]; \
+ int bg = yuvconstants->kRGBCoeffBias[2]; \
+ int br = yuvconstants->kRGBCoeffBias[3]
+
+#define CALC_RGB16 \
+ int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
+ int b16 = y1 + (u * ub) - bb; \
+ int g16 = y1 + bg - (u * ug + v * vg); \
+ int r16 = y1 + (v * vr) - br
+#else
+#define LOAD_YUV_CONSTANTS \
+ int ub = yuvconstants->kUVToB[0]; \
+ int ug = yuvconstants->kUVToG[0]; \
+ int vg = yuvconstants->kUVToG[1]; \
+ int vr = yuvconstants->kUVToR[1]; \
+ int yg = yuvconstants->kYToRgb[0]; \
+ int yb = yuvconstants->kYBiasToRgb[0]
+
+#define CALC_RGB16 \
+ int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
+ int8_t ui = (int8_t)u; \
+ int8_t vi = (int8_t)v; \
+ ui -= 0x80; \
+ vi -= 0x80; \
+ int b16 = y1 + (ui * ub); \
+ int g16 = y1 - (ui * ug + vi * vg); \
+ int r16 = y1 + (vi * vr)
+#endif
// C reference code that mimics the YUV assembly.
// Reads 8 bit YUV and leaves result as 8 bit.
-
static __inline void YuvPixel(uint8_t y,
uint8_t u,
uint8_t v,
@@ -1316,39 +1804,12 @@ static __inline void YuvPixel(uint8_t y,
uint8_t* g,
uint8_t* r,
const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = -yuvconstants->kUVToRB[1];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#elif defined(__arm__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[4];
- int vr = -yuvconstants->kUVToRB[4];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#else
- int ub = yuvconstants->kUVToB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = yuvconstants->kUVToR[1];
- int bb = yuvconstants->kUVBiasB[0];
- int bg = yuvconstants->kUVBiasG[0];
- int br = yuvconstants->kUVBiasR[0];
- int yg = yuvconstants->kYToRgb[0];
-#endif
-
- uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
- *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6);
- *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6);
- *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6);
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y * 0x0101;
+ CALC_RGB16;
+ *b = STATIC_CAST(uint8_t, Clamp((int32_t)(b16) >> 6));
+ *g = STATIC_CAST(uint8_t, Clamp((int32_t)(g16) >> 6));
+ *r = STATIC_CAST(uint8_t, Clamp((int32_t)(r16) >> 6));
}
// Reads 8 bit YUV and leaves result as 16 bit.
@@ -1359,85 +1820,50 @@ static __inline void YuvPixel8_16(uint8_t y,
int* g,
int* r,
const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = -yuvconstants->kUVToRB[1];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#elif defined(__arm__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[4];
- int vr = -yuvconstants->kUVToRB[4];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#else
- int ub = yuvconstants->kUVToB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = yuvconstants->kUVToR[1];
- int bb = yuvconstants->kUVBiasB[0];
- int bg = yuvconstants->kUVBiasG[0];
- int br = yuvconstants->kUVBiasR[0];
- int yg = yuvconstants->kYToRgb[0];
-#endif
-
- uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
- *b = (int)(-(u * ub) + y1 + bb);
- *g = (int)(-(u * ug + v * vg) + y1 + bg);
- *r = (int)(-(v * vr) + y1 + br);
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y * 0x0101;
+ CALC_RGB16;
+ *b = b16;
+ *g = g16;
+ *r = r16;
}
// C reference code that mimics the YUV 16 bit assembly.
// Reads 10 bit YUV and leaves result as 16 bit.
-static __inline void YuvPixel16(int16_t y,
- int16_t u,
- int16_t v,
- int* b,
- int* g,
- int* r,
- const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = -yuvconstants->kUVToRB[1];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#elif defined(__arm__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[4];
- int vr = -yuvconstants->kUVToRB[4];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#else
- int ub = yuvconstants->kUVToB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = yuvconstants->kUVToR[1];
- int bb = yuvconstants->kUVBiasB[0];
- int bg = yuvconstants->kUVBiasG[0];
- int br = yuvconstants->kUVBiasR[0];
- int yg = yuvconstants->kYToRgb[0];
-#endif
+static __inline void YuvPixel10_16(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ int* b,
+ int* g,
+ int* r,
+ const struct YuvConstants* yuvconstants) {
+ LOAD_YUV_CONSTANTS;
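+ // Replicate the 10 bit y sample into 16 bits, msb aligned.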
+ uint32_t y32 = (y << 6) | (y >> 4);
+ u = STATIC_CAST(uint8_t, clamp255(u >> 2));
+ v = STATIC_CAST(uint8_t, clamp255(v >> 2));
+ CALC_RGB16;
+ *b = b16;
+ *g = g16;
+ *r = r16;
+}
- uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
- u = clamp255(u >> 2);
- v = clamp255(v >> 2);
- *b = (int)(-(u * ub) + y1 + bb);
- *g = (int)(-(u * ug + v * vg) + y1 + bg);
- *r = (int)(-(v * vr) + y1 + br);
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 12 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel12_16(int16_t y,
+ int16_t u,
+ int16_t v,
+ int* b,
+ int* g,
+ int* r,
+ const struct YuvConstants* yuvconstants) {
+ LOAD_YUV_CONSTANTS;
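+ // Replicate the 12 bit y sample into 16 bits, msb aligned.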
+ uint32_t y32 = (y << 4) | (y >> 8);
+ u = STATIC_CAST(uint8_t, clamp255(u >> 4));
+ v = STATIC_CAST(uint8_t, clamp255(v >> 4));
+ CALC_RGB16;
+ *b = b16;
+ *g = g16;
+ *r = r16;
}
// C reference code that mimics the YUV 10 bit assembly.
@@ -1452,31 +1878,89 @@ static __inline void YuvPixel10(uint16_t y,
int b16;
int g16;
int r16;
- YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants);
- *b = Clamp(b16 >> 6);
- *g = Clamp(g16 >> 6);
- *r = Clamp(r16 >> 6);
+ YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants);
+ *b = STATIC_CAST(uint8_t, Clamp(b16 >> 6));
+ *g = STATIC_CAST(uint8_t, Clamp(g16 >> 6));
+ *r = STATIC_CAST(uint8_t, Clamp(r16 >> 6));
}
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+// C reference code that mimics the YUV 12 bit assembly.
+// Reads 12 bit YUV and clamps down to 8 bit RGB.
+static __inline void YuvPixel12(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+ int b16;
+ int g16;
+ int r16;
+ YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants);
+ *b = STATIC_CAST(uint8_t, Clamp(b16 >> 6));
+ *g = STATIC_CAST(uint8_t, Clamp(g16 >> 6));
+ *r = STATIC_CAST(uint8_t, Clamp(r16 >> 6));
+}
-// C reference code that mimics the YUV assembly.
-static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) {
- uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16;
- *b = Clamp((int32_t)(y1 + YGB) >> 6);
- *g = Clamp((int32_t)(y1 + YGB) >> 6);
- *r = Clamp((int32_t)(y1 + YGB) >> 6);
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 16 bit YUV and leaves result as 8 bit.
+static __inline void YuvPixel16_8(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y;
+ u = STATIC_CAST(uint16_t, clamp255(u >> 8));
+ v = STATIC_CAST(uint16_t, clamp255(v >> 8));
+ CALC_RGB16;
+ *b = STATIC_CAST(uint8_t, Clamp((int32_t)(b16) >> 6));
+ *g = STATIC_CAST(uint8_t, Clamp((int32_t)(g16) >> 6));
+ *r = STATIC_CAST(uint8_t, Clamp((int32_t)(r16) >> 6));
}
-#undef YG
-#undef YGB
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 16 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel16_16(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ int* b,
+ int* g,
+ int* r,
+ const struct YuvConstants* yuvconstants) {
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y;
+ u = STATIC_CAST(uint16_t, clamp255(u >> 8));
+ v = STATIC_CAST(uint16_t, clamp255(v >> 8));
+ CALC_RGB16;
+ *b = b16;
+ *g = g16;
+ *r = r16;
+}
+
+// C reference code that mimics the YUV assembly.
+// Reads 8 bit Y and leaves result as 8 bit.
+static __inline void YPixel(uint8_t y,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+ int yg = yuvconstants->kRGBCoeffBias[0];
+ int ygb = yuvconstants->kRGBCoeffBias[4];
+#else
+ int ygb = yuvconstants->kYBiasToRgb[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
+ uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+ uint8_t b8 = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6));
+ *b = b8;
+ *g = b8;
+ *r = b8;
+}
-#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
-// C mimic assembly.
-// TODO(fbarchard): Remove subsampling from Neon.
void I444ToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -1484,45 +1968,33 @@ void I444ToARGBRow_C(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
int x;
- for (x = 0; x < width - 1; x += 2) {
- uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
- uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
- YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
- yuvconstants);
- rgb_buf[3] = 255;
- YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
- yuvconstants);
- rgb_buf[7] = 255;
- src_y += 2;
- src_u += 2;
- src_v += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
+ for (x = 0; x < width; ++x) {
YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 4; // Advance 1 pixel.
}
}
-#else
-void I444ToARGBRow_C(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
+
+void I444ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
for (x = 0; x < width; ++x) {
YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants);
- rgb_buf[3] = 255;
src_y += 1;
src_u += 1;
src_v += 1;
- rgb_buf += 4; // Advance 1 pixel.
+ rgb_buf += 3; // Advance 1 pixel.
}
}
-#endif
// Also used for 420
void I422ToARGBRow_C(const uint8_t* src_y,
@@ -1578,9 +2050,102 @@ void I210ToARGBRow_C(const uint16_t* src_y,
}
}
+void I410ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+void I210AlphaToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2));
+ YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = STATIC_CAST(uint8_t, clamp255(src_a[1] >> 2));
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ src_a += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2));
+ }
+}
+
+void I410AlphaToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2));
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ src_a += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+// 12 bit YUV to ARGB
+void I212ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ }
+}
+
static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
uint32_t ar30;
- b = b >> 4; // convert 10.6 to 10 bit.
+ b = b >> 4; // convert 8.6 fixed point to 10 bit.
g = g >> 4;
r = r >> 4;
b = Clamp10(b);
@@ -1602,9 +2167,9 @@ void I210ToAR30Row_C(const uint16_t* src_y,
int g;
int r;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
StoreAR30(rgb_buf, b, g, r);
- YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ YuvPixel10_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
StoreAR30(rgb_buf + 4, b, g, r);
src_y += 2;
src_u += 1;
@@ -1612,11 +2177,141 @@ void I210ToAR30Row_C(const uint16_t* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
StoreAR30(rgb_buf, b, g, r);
}
}
+// 12 bit YUV to 10 bit AR30
+void I212ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf + 4, b, g, r);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ }
+}
+
+void I410ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width; ++x) {
+ YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+// P210 has 10 bits in the msbs of a 16 bit, NV12 style layout.
+void P210ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1,
+ dst_argb + 2, yuvconstants);
+ dst_argb[3] = 255;
+ YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], dst_argb + 4, dst_argb + 5,
+ dst_argb + 6, yuvconstants);
+ dst_argb[7] = 255;
+ src_y += 2;
+ src_uv += 2;
+ dst_argb += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1,
+ dst_argb + 2, yuvconstants);
+ dst_argb[3] = 255;
+ }
+}
+
+void P410ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1,
+ dst_argb + 2, yuvconstants);
+ dst_argb[3] = 255;
+ src_y += 1;
+ src_uv += 2;
+ dst_argb += 4; // Advance 1 pixel.
+ }
+}
+
+void P210ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+ StoreAR30(dst_ar30, b, g, r);
+ YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+ StoreAR30(dst_ar30 + 4, b, g, r);
+ src_y += 2;
+ src_uv += 2;
+ dst_ar30 += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+ StoreAR30(dst_ar30, b, g, r);
+ }
+}
+
+void P410ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width; ++x) {
+ YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+ StoreAR30(dst_ar30, b, g, r);
+ src_y += 1;
+ src_uv += 2;
+ dst_ar30 += 4; // Advance 1 pixel.
+ }
+}
+
// 8 bit YUV to 10 bit AR30
// Uses the same code as 10 bit YUV but shifts the 8 bit values up to 10 bits.
void I422ToAR30Row_C(const uint8_t* src_y,
@@ -1645,6 +2340,26 @@ void I422ToAR30Row_C(const uint8_t* src_y,
}
}
+void I444AlphaToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = src_a[0];
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ src_a += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
void I422AlphaToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -1718,8 +2433,10 @@ void I422ToARGB4444Row_C(const uint8_t* src_y,
b1 = b1 >> 4;
g1 = g1 >> 4;
r1 = r1 >> 4;
- *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) |
- (g1 << 20) | (r1 << 24) | 0xf000f000;
+ *(uint16_t*)(dst_argb4444 + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | 0xf000);
+ *(uint16_t*)(dst_argb4444 + 2) =
+ STATIC_CAST(uint16_t, b1 | (g1 << 4) | (r1 << 8) | 0xf000);
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1730,7 +2447,8 @@ void I422ToARGB4444Row_C(const uint8_t* src_y,
b0 = b0 >> 4;
g0 = g0 >> 4;
r0 = r0 >> 4;
- *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000;
+ *(uint16_t*)(dst_argb4444) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | 0xf000);
}
}
@@ -1756,8 +2474,10 @@ void I422ToARGB1555Row_C(const uint8_t* src_y,
b1 = b1 >> 3;
g1 = g1 >> 3;
r1 = r1 >> 3;
- *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) |
- (g1 << 21) | (r1 << 26) | 0x80008000;
+ *(uint16_t*)(dst_argb1555 + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | 0x8000);
+ *(uint16_t*)(dst_argb1555 + 2) =
+ STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 10) | 0x8000);
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1768,7 +2488,8 @@ void I422ToARGB1555Row_C(const uint8_t* src_y,
b0 = b0 >> 3;
g0 = g0 >> 3;
r0 = r0 >> 3;
- *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000;
+ *(uint16_t*)(dst_argb1555) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | 0x8000);
}
}
@@ -1794,8 +2515,10 @@ void I422ToRGB565Row_C(const uint8_t* src_y,
b1 = b1 >> 3;
g1 = g1 >> 2;
r1 = r1 >> 3;
- *(uint32_t*)(dst_rgb565) =
- b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
+ *(uint16_t*)(dst_rgb565 + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11));
+ *(uint16_t*)(dst_rgb565 + 2) =
+ STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 11));
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1806,7 +2529,8 @@ void I422ToRGB565Row_C(const uint8_t* src_y,
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
- *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ *(uint16_t*)(dst_rgb565 + 0) =
+ STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11));
}
}
@@ -1921,8 +2645,12 @@ void NV12ToRGB565Row_C(const uint8_t* src_y,
b1 = b1 >> 3;
g1 = g1 >> 2;
r1 = r1 >> 3;
- *(uint32_t*)(dst_rgb565) =
- b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
+ *(uint16_t*)(dst_rgb565 + 0) = STATIC_CAST(uint16_t, b0) |
+ STATIC_CAST(uint16_t, g0 << 5) |
+ STATIC_CAST(uint16_t, r0 << 11);
+ *(uint16_t*)(dst_rgb565 + 2) = STATIC_CAST(uint16_t, b1) |
+ STATIC_CAST(uint16_t, g1 << 5) |
+ STATIC_CAST(uint16_t, r1 << 11);
src_y += 2;
src_uv += 2;
dst_rgb565 += 4; // Advance 2 pixels.
@@ -1932,7 +2660,9 @@ void NV12ToRGB565Row_C(const uint8_t* src_y,
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
- *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ *(uint16_t*)(dst_rgb565) = STATIC_CAST(uint16_t, b0) |
+ STATIC_CAST(uint16_t, g0 << 5) |
+ STATIC_CAST(uint16_t, r0 << 11);
}
}
@@ -2006,18 +2736,21 @@ void I422ToRGBARow_C(const uint8_t* src_y,
}
}
-void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
+void I400ToARGBRow_C(const uint8_t* src_y,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -2035,10 +2768,34 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
-void MirrorUVRow_C(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width) {
+ int x;
+ src += width - 1;
+ for (x = 0; x < width - 1; x += 2) {
+ dst[x] = src[0];
+ dst[x + 1] = src[-1];
+ src -= 2;
+ }
+ if (width & 1) {
+ dst[width - 1] = src[0];
+ }
+}
+
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ src_uv += (width - 1) << 1;
+ for (x = 0; x < width; ++x) {
+ dst_uv[0] = src_uv[0];
+ dst_uv[1] = src_uv[1];
+ src_uv -= 2;
+ dst_uv += 2;
+ }
+}
+
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
src_uv += (width - 1) << 1;
for (x = 0; x < width - 1; x += 2) {
@@ -2069,6 +2826,21 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
+void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) {
+ int x;
+ src_rgb24 += width * 3 - 3;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_rgb24[0];
+ uint8_t g = src_rgb24[1];
+ uint8_t r = src_rgb24[2];
+ dst_rgb24[0] = b;
+ dst_rgb24[1] = g;
+ dst_rgb24[2] = r;
+ src_rgb24 -= 3;
+ dst_rgb24 += 3;
+ }
+}
+
void SplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -2105,6 +2877,98 @@ void MergeUVRow_C(const uint8_t* src_u,
}
}
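+// Detile one row: gather 16 byte segments spaced src_tile_stride apart in the
+// tiled source into one contiguous row.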
+void DetileRow_C(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width) {
+ int x;
+ for (x = 0; x < width - 15; x += 16) {
+ memcpy(dst, src, 16);
+ dst += 16;
+ src += src_tile_stride;
+ }
+ if (width & 15) {
+ memcpy(dst, src, width & 15);
+ }
+}
+
+void DetileRow_16_C(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width) {
+ int x;
+ for (x = 0; x < width - 15; x += 16) {
+ memcpy(dst, src, 16 * sizeof(uint16_t));
+ dst += 16;
+ src += src_tile_stride;
+ }
+ if (width & 15) {
+ memcpy(dst, src, (width & 15) * sizeof(uint16_t));
+ }
+}
+
+void DetileSplitUVRow_C(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ for (x = 0; x < width - 15; x += 16) {
+ SplitUVRow_C(src_uv, dst_u, dst_v, 8);
+ dst_u += 8;
+ dst_v += 8;
+ src_uv += src_tile_stride;
+ }
+ if (width & 15) {
+ SplitUVRow_C(src_uv, dst_u, dst_v, ((width & 15) + 1) / 2);
+ }
+}
+
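+// Interleave 16 pixel tiles of Y and UV into packed YUY2. Only whole 16
+// pixel tiles are converted; any trailing partial tile is left unwritten.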
+void DetileToYUY2_C(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ for (int x = 0; x < width - 15; x += 16) {
+ for (int i = 0; i < 8; i++) {
+ dst_yuy2[0] = src_y[0];
+ dst_yuy2[1] = src_uv[0];
+ dst_yuy2[2] = src_y[1];
+ dst_yuy2[3] = src_uv[1];
+ dst_yuy2 += 4;
+ src_y += 2;
+ src_uv += 2;
+ }
+ src_y += src_y_tile_stride - 16;
+ src_uv += src_uv_tile_stride - 16;
+ }
+}
+
+// Unpack MT2T into tiled P010, 64 pixels at a time. MT2T's bitstream is
+// encoded in 80 byte blocks representing 64 pixels each. The first 16 bytes
+// of a block contain all of the lower 2 bits of each pixel packed together,
+// and the next 64 bytes hold the upper 8 bits of each pixel. The lower bits
+// are packed into 1x4 blocks, whereas the upper bits are stored in normal
+// raster order.
+void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size) {
+ for (size_t i = 0; i < size; i += 80) {
+ const uint8_t* src_lower_bits = src;
+ const uint8_t* src_upper_bits = src + 16;
+
+ for (int j = 0; j < 4; j++) {
+ for (int k = 0; k < 16; k++) {
+ *dst++ = ((src_lower_bits[k] >> (j * 2)) & 0x3) << 6 |
+ (uint16_t)*src_upper_bits << 8 |
+ (uint16_t)*src_upper_bits >> 2;
+ src_upper_bits++;
+ }
+ }
+
+ src += 80;
+ }
+}
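+
+// Illustrative sketch (not part of this patch): feeding one 80 byte MT2T
+// block through the helper above; the buffer contents here are arbitrary.
+//
+//   uint8_t block[80] = {0};  // 16 bytes of low bits + 64 bytes of high bits
+//   uint16_t pixels[64];      // 64 outputs, 10 bits msb aligned (P010 style)
+//   UnpackMT2T_C(block, pixels, sizeof(block));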
+
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
@@ -2133,27 +2997,197 @@ void MergeRGBRow_C(const uint8_t* src_r,
}
}
-// Use scale to convert lsb formats to msb, depending how many bits there are:
-// 128 = 9 bits
-// 64 = 10 bits
-// 16 = 12 bits
-// 1 = 16 bits
+void SplitARGBRow_C(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_b[x] = src_argb[0];
+ dst_g[x] = src_argb[1];
+ dst_r[x] = src_argb[2];
+ dst_a[x] = src_argb[3];
+ src_argb += 4;
+ }
+}
+
+void MergeARGBRow_C(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_argb[0] = src_b[x];
+ dst_argb[1] = src_g[x];
+ dst_argb[2] = src_r[x];
+ dst_argb[3] = src_a[x];
+ dst_argb += 4;
+ }
+}
+
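+// Pack 10 bit R, G and B planes into AR30: 2 bit alpha in the top bits, then
+// 10 bits each of r, g, b down to the lsb. e.g. 0x3ff in all three channels
+// yields opaque white, 0xffffffff.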
+void MergeXR30Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width) {
+ assert(depth >= 10);
+ assert(depth <= 16);
+ int x;
+ int shift = depth - 10;
+ uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30;
+ for (x = 0; x < width; ++x) {
+ uint32_t r = clamp1023(src_r[x] >> shift);
+ uint32_t g = clamp1023(src_g[x] >> shift);
+ uint32_t b = clamp1023(src_b[x] >> shift);
+ dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000;
+ }
+}
+
+void MergeAR64Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ assert(depth >= 1);
+ assert(depth <= 16);
+ int x;
+ int shift = 16 - depth;
+ int max = (1 << depth) - 1;
+ for (x = 0; x < width; ++x) {
+ dst_ar64[0] = STATIC_CAST(uint16_t, ClampMax(src_b[x], max) << shift);
+ dst_ar64[1] = STATIC_CAST(uint16_t, ClampMax(src_g[x], max) << shift);
+ dst_ar64[2] = STATIC_CAST(uint16_t, ClampMax(src_r[x], max) << shift);
+ dst_ar64[3] = STATIC_CAST(uint16_t, ClampMax(src_a[x], max) << shift);
+ dst_ar64 += 4;
+ }
+}
+
+void MergeARGB16To8Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ assert(depth >= 8);
+ assert(depth <= 16);
+ int x;
+ int shift = depth - 8;
+ for (x = 0; x < width; ++x) {
+ dst_argb[0] = STATIC_CAST(uint8_t, clamp255(src_b[x] >> shift));
+ dst_argb[1] = STATIC_CAST(uint8_t, clamp255(src_g[x] >> shift));
+ dst_argb[2] = STATIC_CAST(uint8_t, clamp255(src_r[x] >> shift));
+ dst_argb[3] = STATIC_CAST(uint8_t, clamp255(src_a[x] >> shift));
+ dst_argb += 4;
+ }
+}
+
+void MergeXR64Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ assert(depth >= 1);
+ assert(depth <= 16);
+ int x;
+ int shift = 16 - depth;
+ int max = (1 << depth) - 1;
+ for (x = 0; x < width; ++x) {
+ dst_ar64[0] = STATIC_CAST(uint16_t, ClampMax(src_b[x], max) << shift);
+ dst_ar64[1] = STATIC_CAST(uint16_t, ClampMax(src_g[x], max) << shift);
+ dst_ar64[2] = STATIC_CAST(uint16_t, ClampMax(src_r[x], max) << shift);
+ dst_ar64[3] = 0xffff;
+ dst_ar64 += 4;
+ }
+}
+
+void MergeXRGB16To8Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ assert(depth >= 8);
+ assert(depth <= 16);
+ int x;
+ int shift = depth - 8;
+ for (x = 0; x < width; ++x) {
+ dst_argb[0] = STATIC_CAST(uint8_t, clamp255(src_b[x] >> shift));
+ dst_argb[1] = STATIC_CAST(uint8_t, clamp255(src_g[x] >> shift));
+ dst_argb[2] = STATIC_CAST(uint8_t, clamp255(src_r[x] >> shift));
+ dst_argb[3] = 0xff;
+ dst_argb += 4;
+ }
+}
+
+void SplitXRGBRow_C(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_b[x] = src_argb[0];
+ dst_g[x] = src_argb[1];
+ dst_r[x] = src_argb[2];
+ src_argb += 4;
+ }
+}
+
+void MergeXRGBRow_C(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_argb[0] = src_b[x];
+ dst_argb[1] = src_g[x];
+ dst_argb[2] = src_r[x];
+ dst_argb[3] = 255;
+ dst_argb += 4;
+ }
+}
+
+// Convert lsb formats to msb, depending on sample depth.
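+// e.g. depth 10: shift = 6, so a 10 bit 0x3ff becomes msb aligned 0xffc0.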
void MergeUVRow_16_C(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
- int scale,
+ int depth,
int width) {
+ int shift = 16 - depth;
+ assert(depth >= 8);
+ assert(depth <= 16);
int x;
- for (x = 0; x < width - 1; x += 2) {
- dst_uv[0] = src_u[x] * scale;
- dst_uv[1] = src_v[x] * scale;
- dst_uv[2] = src_u[x + 1] * scale;
- dst_uv[3] = src_v[x + 1] * scale;
- dst_uv += 4;
+ for (x = 0; x < width; ++x) {
+ dst_uv[0] = STATIC_CAST(uint16_t, src_u[x] << shift);
+ dst_uv[1] = STATIC_CAST(uint16_t, src_v[x] << shift);
+ dst_uv += 2;
}
- if (width & 1) {
- dst_uv[0] = src_u[width - 1] * scale;
- dst_uv[1] = src_v[width - 1] * scale;
+}
+
+// Convert msb formats to lsb, depending on sample depth.
+void SplitUVRow_16_C(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int x;
+ assert(depth >= 8);
+ assert(depth <= 16);
+ for (x = 0; x < width; ++x) {
+ dst_u[x] = src_uv[0] >> shift;
+ dst_v[x] = src_uv[1] >> shift;
+ src_uv += 2;
}
}
@@ -2163,7 +3197,17 @@ void MultiplyRow_16_C(const uint16_t* src_y,
int width) {
int x;
for (x = 0; x < width; ++x) {
- dst_y[x] = src_y[x] * scale;
+ dst_y[x] = STATIC_CAST(uint16_t, src_y[x] * scale);
+ }
+}
+
+void DivideRow_16_C(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_y[x] = (src_y[x] * scale) >> 16;
}
}
@@ -2172,13 +3216,19 @@ void MultiplyRow_16_C(const uint16_t* src_y,
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)
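+// i.e. scale = 1 << (24 - bits), so (v * scale) >> 16 == v >> (bits - 8).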
+
void Convert16To8Row_C(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width) {
int x;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+
for (x = 0; x < width; ++x) {
- dst_y[x] = clamp255((src_y[x] * scale) >> 16);
+ dst_y[x] = STATIC_CAST(uint8_t, C16TO8(src_y[x], scale));
}
}
@@ -2208,10 +3258,9 @@ void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
}
void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
- uint32_t* d = (uint32_t*)(dst_argb);
int x;
for (x = 0; x < width; ++x) {
- d[x] = v32;
+ memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32);
}
}
@@ -2232,6 +3281,21 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2,
}
}
+// Filter 2 rows of YUY2 UV's (422) into UV (NV12).
+void YUY2ToNVUVRow_C(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_uv,
+ int width) {
+ // Output a row of UV values, filtering 2 rows of YUY2.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_uv[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
+ dst_uv[1] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
+ src_yuy2 += 4;
+ dst_uv += 2;
+ }
+}
+
// Copy row of YUY2 UV's (422) into U and V (422).
void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
uint8_t* dst_u,
@@ -2309,56 +3373,56 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
}
}
-#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
+#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f)
-// Blend src_argb0 over src_argb1 and store to dst_argb.
-// dst_argb may be src_argb0 or src_argb1.
+// Blend src_argb over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb or src_argb1.
// This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_C(const uint8_t* src_argb0,
+void ARGBBlendRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- uint32_t fb = src_argb0[0];
- uint32_t fg = src_argb0[1];
- uint32_t fr = src_argb0[2];
- uint32_t a = src_argb0[3];
+ uint32_t fb = src_argb[0];
+ uint32_t fg = src_argb[1];
+ uint32_t fr = src_argb[2];
+ uint32_t a = src_argb[3];
uint32_t bb = src_argb1[0];
uint32_t bg = src_argb1[1];
uint32_t br = src_argb1[2];
- dst_argb[0] = BLEND(fb, bb, a);
- dst_argb[1] = BLEND(fg, bg, a);
- dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a));
+ dst_argb[1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a));
+ dst_argb[2] = STATIC_CAST(uint8_t, BLEND(fr, br, a));
dst_argb[3] = 255u;
- fb = src_argb0[4 + 0];
- fg = src_argb0[4 + 1];
- fr = src_argb0[4 + 2];
- a = src_argb0[4 + 3];
+ fb = src_argb[4 + 0];
+ fg = src_argb[4 + 1];
+ fr = src_argb[4 + 2];
+ a = src_argb[4 + 3];
bb = src_argb1[4 + 0];
bg = src_argb1[4 + 1];
br = src_argb1[4 + 2];
- dst_argb[4 + 0] = BLEND(fb, bb, a);
- dst_argb[4 + 1] = BLEND(fg, bg, a);
- dst_argb[4 + 2] = BLEND(fr, br, a);
+ dst_argb[4 + 0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a));
+ dst_argb[4 + 1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a));
+ dst_argb[4 + 2] = STATIC_CAST(uint8_t, BLEND(fr, br, a));
dst_argb[4 + 3] = 255u;
- src_argb0 += 8;
+ src_argb += 8;
src_argb1 += 8;
dst_argb += 8;
}
if (width & 1) {
- uint32_t fb = src_argb0[0];
- uint32_t fg = src_argb0[1];
- uint32_t fr = src_argb0[2];
- uint32_t a = src_argb0[3];
+ uint32_t fb = src_argb[0];
+ uint32_t fg = src_argb[1];
+ uint32_t fr = src_argb[2];
+ uint32_t a = src_argb[3];
uint32_t bb = src_argb1[0];
uint32_t bg = src_argb1[1];
uint32_t br = src_argb1[2];
- dst_argb[0] = BLEND(fb, bb, a);
- dst_argb[1] = BLEND(fg, bg, a);
- dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a));
+ dst_argb[1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a));
+ dst_argb[2] = STATIC_CAST(uint8_t, BLEND(fr, br, a));
dst_argb[3] = 255u;
}
}
@@ -2385,10 +3449,9 @@ void BlendPlaneRow_C(const uint8_t* src0,
}
#undef UBLEND
-#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
+#define ATTENUATE(f, a) (f * a + 255) >> 8
// Multiply source RGB by alpha and store to destination.
-// This code mimics the SSSE3 version for better testability.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int i;
for (i = 0; i < width - 1; i += 2) {
@@ -2399,7 +3462,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
dst_argb[0] = ATTENUATE(b, a);
dst_argb[1] = ATTENUATE(g, a);
dst_argb[2] = ATTENUATE(r, a);
- dst_argb[3] = a;
+ dst_argb[3] = STATIC_CAST(uint8_t, a);
b = src_argb[4];
g = src_argb[5];
r = src_argb[6];
@@ -2407,7 +3470,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
dst_argb[4] = ATTENUATE(b, a);
dst_argb[5] = ATTENUATE(g, a);
dst_argb[6] = ATTENUATE(r, a);
- dst_argb[7] = a;
+ dst_argb[7] = STATIC_CAST(uint8_t, a);
src_argb += 8;
dst_argb += 8;
}
@@ -2420,7 +3483,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
dst_argb[0] = ATTENUATE(b, a);
dst_argb[1] = ATTENUATE(g, a);
dst_argb[2] = ATTENUATE(r, a);
- dst_argb[3] = a;
+ dst_argb[3] = STATIC_CAST(uint8_t, a);
}
}
#undef ATTENUATE
@@ -2472,6 +3535,14 @@ const uint32_t fixed_invtbl8[256] = {
T(0xfc), T(0xfd), T(0xfe), 0x01000100};
#undef T
+#if LIBYUV_UNATTENUATE_DUP
+// This code mimics the Intel SIMD version for better testability.
+#define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16)
+#else
+#define UNATTENUATE(f, ia) clamp255((f * ia) >> 8)
+#endif
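+
+// e.g. f = 128, a = 128: ia ~= 0x10000 / 128 = 512 (2.0 in 8.8 fixed point);
+// either variant computes ~256 and clamps to 255.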
+
+// Mimics the Intel SIMD code for exactness.
void ARGBUnattenuateRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
@@ -2482,14 +3553,12 @@ void ARGBUnattenuateRow_C(const uint8_t* src_argb,
uint32_t r = src_argb[2];
const uint32_t a = src_argb[3];
const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point
- b = (b * ia) >> 8;
- g = (g * ia) >> 8;
- r = (r * ia) >> 8;
+
// Clamping should not be necessary but is free in assembly.
- dst_argb[0] = clamp255(b);
- dst_argb[1] = clamp255(g);
- dst_argb[2] = clamp255(r);
- dst_argb[3] = a;
+ dst_argb[0] = STATIC_CAST(uint8_t, UNATTENUATE(b, ia));
+ dst_argb[1] = STATIC_CAST(uint8_t, UNATTENUATE(g, ia));
+ dst_argb[2] = STATIC_CAST(uint8_t, UNATTENUATE(r, ia));
+ dst_argb[3] = STATIC_CAST(uint8_t, a);
src_argb += 4;
dst_argb += 4;
}
@@ -2519,13 +3588,24 @@ void CumulativeSumToAverageRow_C(const int32_t* tl,
int area,
uint8_t* dst,
int count) {
- float ooa = 1.0f / area;
+ float ooa;
int i;
+ assert(area != 0);
+
+ ooa = 1.0f / STATIC_CAST(float, area);
for (i = 0; i < count; ++i) {
- dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
- dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
- dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
- dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+ dst[0] =
+ (uint8_t)(STATIC_CAST(float, bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) *
+ ooa);
+ dst[1] =
+ (uint8_t)(STATIC_CAST(float, bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) *
+ ooa);
+ dst[2] =
+ (uint8_t)(STATIC_CAST(float, bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) *
+ ooa);
+ dst[3] =
+ (uint8_t)(STATIC_CAST(float, bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) *
+ ooa);
dst += 4;
tl += 4;
bl += 4;
@@ -2576,6 +3656,19 @@ static void HalfRow_16_C(const uint16_t* src_uv,
}
}
+static void HalfRow_16To8_C(const uint16_t* src_uv,
+ ptrdiff_t src_uv_stride,
+ uint8_t* dst_uv,
+ int scale,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_uv[x] = STATIC_CAST(
+ uint8_t,
+ C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale));
+ }
+}
+
// C version 2x2 -> 2x1.
void InterpolateRow_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
@@ -2586,6 +3679,9 @@ void InterpolateRow_C(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
int x;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+
if (y1_fraction == 0) {
memcpy(dst_ptr, src_ptr, width);
return;
@@ -2594,21 +3690,17 @@ void InterpolateRow_C(uint8_t* dst_ptr,
HalfRow_C(src_ptr, src_stride, dst_ptr, width);
return;
}
- for (x = 0; x < width - 1; x += 2) {
- dst_ptr[0] =
- (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
- dst_ptr[1] =
- (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
- src_ptr += 2;
- src_ptr1 += 2;
- dst_ptr += 2;
- }
- if (width & 1) {
- dst_ptr[0] =
- (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+ for (x = 0; x < width; ++x) {
+ dst_ptr[0] = STATIC_CAST(
+ uint8_t,
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8);
+ ++src_ptr;
+ ++src_ptr1;
+ ++dst_ptr;
}
}
+// C version 2x2 -> 2x1.
void InterpolateRow_16_C(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
@@ -2618,23 +3710,65 @@ void InterpolateRow_16_C(uint16_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
int x;
- if (source_y_fraction == 0) {
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+
+ if (y1_fraction == 0) {
memcpy(dst_ptr, src_ptr, width * 2);
return;
}
- if (source_y_fraction == 128) {
+ if (y1_fraction == 128) {
HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
return;
}
- for (x = 0; x < width - 1; x += 2) {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
- dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
- src_ptr += 2;
- src_ptr1 += 2;
- dst_ptr += 2;
+ for (x = 0; x < width; ++x) {
+ dst_ptr[0] = STATIC_CAST(
+ uint16_t,
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8);
+ ++src_ptr;
+ ++src_ptr1;
+ ++dst_ptr;
}
- if (width & 1) {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+}
+
+// C version 2x2 16 bit -> 2x1 8 bit.
+// Use scale to convert lsb formats to msb, depending on how many bits there
+// are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+
+void InterpolateRow_16To8_C(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ int x;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+
+ if (source_y_fraction == 0) {
+ Convert16To8Row_C(src_ptr, dst_ptr, scale, width);
+ return;
+ }
+ if (source_y_fraction == 128) {
+ HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width);
+ return;
+ }
+ for (x = 0; x < width; ++x) {
+ dst_ptr[0] = STATIC_CAST(
+ uint8_t,
+ C16TO8(
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8,
+ scale));
+ src_ptr += 1;
+ src_ptr1 += 1;
+ dst_ptr += 1;
}
}
@@ -2743,10 +3877,10 @@ void ARGBPolynomialRow_C(const uint8_t* src_argb,
dr += poly[14] * r3;
da += poly[15] * a3;
- dst_argb[0] = Clamp((int32_t)(db));
- dst_argb[1] = Clamp((int32_t)(dg));
- dst_argb[2] = Clamp((int32_t)(dr));
- dst_argb[3] = Clamp((int32_t)(da));
+ dst_argb[0] = STATIC_CAST(uint8_t, Clamp((int32_t)(db)));
+ dst_argb[1] = STATIC_CAST(uint8_t, Clamp((int32_t)(dg)));
+ dst_argb[2] = STATIC_CAST(uint8_t, Clamp((int32_t)(dr)));
+ dst_argb[3] = STATIC_CAST(uint8_t, Clamp((int32_t)(da)));
src_argb += 4;
dst_argb += 4;
}
@@ -2873,7 +4007,7 @@ void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
// Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048
-#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
+#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \
defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
@@ -3151,6 +4285,32 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
}
#endif
+#if defined(HAS_I444TORGB24ROW_AVX2)
+void I444ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I444ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+ ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
+ src_y += twidth;
+ src_u += twidth;
+ src_v += twidth;
+ dst_rgb24 += twidth * 3;
+ width -= twidth;
+ }
+}
+#endif
+
#if defined(HAS_NV12TORGB565ROW_AVX2)
void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
const uint8_t* src_uv,
@@ -3175,12 +4335,93 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
}
#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+ ARGBToYJRow_AVX2(row, dst_yj, twidth);
+ src_rgb24 += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RGB24TOYJROW_AVX2
+
+#ifdef HAS_RAWTOYJROW_AVX2
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RAWToARGBRow_SSSE3(src_raw, row, twidth);
+ ARGBToYJRow_AVX2(row, dst_yj, twidth);
+ src_raw += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RAWTOYJROW_AVX2
+
+#ifdef HAS_RGB24TOYJROW_SSSE3
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+ ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+ src_rgb24 += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RGB24TOYJROW_SSSE3
+
+#ifdef HAS_RAWTOYJROW_SSSE3
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RAWToARGBRow_SSSE3(src_raw, row, twidth);
+ ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+ src_raw += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RAWTOYJROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
+void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction) {
+ // Row buffer for intermediate 16 bit pixels.
+ SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction);
+ Convert16To8Row_AVX2(row, dst_ptr, scale, twidth);
+ src_ptr += twidth;
+ dst_ptr += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_INTERPOLATEROW_16TO8_AVX2
+
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fsum = 0.f;
int i;
-#if defined(__clang__)
-#pragma clang loop vectorize_width(4)
-#endif
for (i = 0; i < width; ++i) {
float v = *src++;
fsum += v * v;
@@ -3211,8 +4452,9 @@ void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) {
int i;
for (i = 0; i < width; ++i) {
- *dst++ =
- (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
+ *dst++ = STATIC_CAST(
+ uint16_t,
+ (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8);
++src;
}
}
@@ -3231,6 +4473,29 @@ void GaussCol_C(const uint16_t* src0,
}
}
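+
+// Filter one row with 1, 4, 6, 4, 1 coefficients to produce 1 row. The
+// 1 / 256 factor folds in the 16x gain of this pass and of the matching
+// column pass.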
+void GaussRow_F32_C(const float* src, float* dst, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) *
+ (1.0f / 256.0f);
+ ++src;
+ }
+}
+
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_C(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
+ }
+}
+
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_C(const uint8_t* src_y,
const uint8_t* src_vu,
@@ -3256,13 +4521,14 @@ void NV21ToYUV24Row_C(const uint8_t* src_y,
}
// Filter 2 rows of AYUV UV's (444) into UV (420).
+// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
void AYUVToUVRow_C(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width) {
// Output a row of UV values, filtering 2x2 rows of AYUV.
int x;
- for (x = 0; x < width; x += 2) {
+ for (x = 0; x < width - 1; x += 2) {
dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
src_ayuv[src_stride_ayuv + 5] + 2) >>
2;
@@ -3273,12 +4539,8 @@ void AYUVToUVRow_C(const uint8_t* src_ayuv,
dst_uv += 2;
}
if (width & 1) {
- dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
- src_ayuv[src_stride_ayuv + 0] + 2) >>
- 2;
- dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
- src_ayuv[src_stride_ayuv + 1] + 2) >>
- 2;
+ dst_uv[0] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1;
+ dst_uv[1] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1;
}
}
@@ -3289,7 +4551,7 @@ void AYUVToVURow_C(const uint8_t* src_ayuv,
int width) {
// Output a row of VU values, filtering 2x2 rows of AYUV.
int x;
- for (x = 0; x < width; x += 2) {
+ for (x = 0; x < width - 1; x += 2) {
dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
src_ayuv[src_stride_ayuv + 4] + 2) >>
2;
@@ -3300,12 +4562,8 @@ void AYUVToVURow_C(const uint8_t* src_ayuv,
dst_vu += 2;
}
if (width & 1) {
- dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
- src_ayuv[src_stride_ayuv + 0] + 2) >>
- 2;
- dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
- src_ayuv[src_stride_ayuv + 1] + 2) >>
- 2;
+ dst_vu[0] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1;
+ dst_vu[1] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1;
}
}
@@ -3319,7 +4577,8 @@ void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
}
}
-void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
int x;
for (x = 0; x < width; ++x) {
uint8_t u = src_uv[0];
@@ -3331,19 +4590,32 @@ void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
}
}
-// divide values by weights and provide mask to indicate weight of 0.
-void FloatDivToByteRow_C(const float* src_weights,
- const float* src_values,
- uint8_t* dst_out,
- uint8_t* dst_mask,
- int width) {
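+// 2x2 average planar U and V and interleave the result into NV12 style UV.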
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
int x;
- for (x = 0; x < width; ++x) {
- dst_out[x] = Clamp(src_values[x] / src_weights[x]);
- dst_mask[x] = src_weights[x] > 0 ? 0 : 0xff;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+ src_u[src_stride_u + 1] + 2) >>
+ 2;
+ dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+ src_v[src_stride_v + 1] + 2) >>
+ 2;
+ src_u += 2;
+ src_v += 2;
+ dst_uv += 2;
+ }
+ if (width & 1) {
+ dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1;
+ dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1;
}
}
+#undef STATIC_CAST
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
new file mode 100644
index 00000000..d8074987
--- /dev/null
+++ b/source/row_gcc.cc
@@ -0,0 +1,9744 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// Constants for ARGB
+static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
+ 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
+
+// JPEG full range.
+static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
+ 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
+
+static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u,
+ 77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u};
+
+static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
+ 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+ 112, -74, -38, 0, 112, -74, -38, 0};
+
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+ 127, -84, -43, 0, 127, -84, -43, 0};
+
+static const vec8 kABGRToUJ = {-43, -84, 127, 0, -43, -84, 127, 0,
+ -43, -84, 127, 0, -43, -84, 127, 0};
+
+static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
+ -18, -94, 112, 0, -18, -94, 112, 0};
+
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+ -20, -107, 127, 0, -20, -107, 127, 0};
+
+static const vec8 kABGRToVJ = {127, -107, -20, 0, 127, -107, -20, 0,
+ 127, -107, -20, 0, 127, -107, -20, 0};
+
+// Constants for BGRA
+static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
+ 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
+
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+ 0, -38, -74, 112, 0, -38, -74, 112};
+
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+ 0, 112, -94, -18, 0, 112, -94, -18};
+
+// Constants for ABGR
+static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
+ 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
+
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+ -38, -74, 112, 0, -38, -74, 112, 0};
+
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+ 112, -94, -18, 0, 112, -94, -18, 0};
+
+// Constants for RGBA.
+static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
+ 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
+
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+ 0, 112, -74, -38, 0, 112, -74, -38};
+
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+ 0, -18, -94, 112, 0, -18, -94, 112};
+
+static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
+ 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
+
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
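+
+// A scalar sketch of how the Y constants above combine (illustration only;
+// hypothetical helper, not part of the library). The Y kernels below subtract
+// 128 from each byte, multiply-add the coefficients, then add back a bias:
+// kAddY16 (0x7e80) equals 0x1080 + 128 * (25 + 129 + 66), so the net result
+// is Y = (66*R + 129*G + 25*B + 0x1080) >> 8.
+#if 0
+static uint8_t ScalarARGBToY(uint8_t b, uint8_t g, uint8_t r) {
+  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
+}
+#endif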
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+
+// Shuffle table for converting RGB24 to ARGB.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
+
+// Shuffle table for converting RAW to ARGB.
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
+ 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+
+// Shuffle table for converting RAW to RGBA.
+static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
+ 14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};
+
+// Shuffle table for converting RAW to RGB24. First 8.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+ 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24. Middle 8.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+ 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24. Last 8.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+ 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGB to RGB24.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGB to RAW.
+static const uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
+
+// YUY2 shuf 16 Y to 32 Y.
+static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
+ 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
+ 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
+
+// YUY2 shuf 8 UV to 16 UV.
+static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
+ 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
+ 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
+
+// UYVY shuf 16 Y to 32 Y.
+static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
+ 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
+ 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
+
+// UYVY shuf 8 UV to 16 UV.
+static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
+ 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
+ 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
+
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
+#endif // HAS_RGB24TOARGBROW_SSSE3
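+
+// How to read the shuffle tables above: pshufb computes dst[i] = src[tbl[i]]
+// for each byte, and an index with the high bit set (128) writes zero. A
+// scalar sketch of that semantic, for illustration only (hypothetical helper,
+// not part of the library):
+#if 0
+static void ScalarPshufb(const uint8_t src[16], const uint8_t tbl[16],
+                         uint8_t dst[16]) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    dst[i] = (tbl[i] & 0x80) ? 0 : src[tbl[i] & 15];
+  }
+}
+#endif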
+
+#ifdef HAS_J400TOARGBROW_SSE2
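+// J400 is a grayscale format: each iteration expands 8 Y bytes into 8 gray
+// ARGB pixels with B = G = R = Y and A = 0xff (the 0xff000000 built in xmm5).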
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_J400TOARGBROW_SSE2
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRGB24ToARGB) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToARGB) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// Same code as RAWToARGBRow but with a different shuffler and A in the low bits.
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
+ "psrld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToRGBA) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+ "movdqa %5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x4(%0),%%xmm1 \n"
+ "movdqu 0x8(%0),%%xmm2 \n"
+ "lea 0x18(%0),%0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToRGB24_0), // %3
+ "m"(kShuffleMaskRAWToRGB24_1), // %4
+ "m"(kShuffleMaskRAWToRGB24_2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,0x00(%1,%0,2) \n"
+ "movdqu %%xmm1,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRGB24) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRAW) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_ARGBTORGB24ROW_AVX2
+// vpermd permutation packing two 12 byte lane halves into 24 contiguous bytes
+static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
+
+void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm6 \n"
+ "vmovdqa %4,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRGB24), // %3
+ "m"(kPermdRGB24_AVX) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
+// Shuffle table for converting ARGBToRGB24
+static const ulvec8 kPermARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
+ 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
+ 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
+static const ulvec8 kPermARGBToRGB24_1 = {
+ 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
+ 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
+ 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
+static const ulvec8 kPermARGBToRGB24_2 = {
+ 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
+ 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
+ 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
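+
+// Note: vpermt2b indexes into the 64 byte concatenation of two source
+// registers, so table entries 0-31 select bytes from the first source and
+// 32-63 from the second.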
+
+void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vmovdqa %3,%%ymm5 \n"
+ "vmovdqa %4,%%ymm6 \n"
+ "vmovdqa %5,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
+ "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kPermARGBToRGB24_0), // %3
+ "m"(kPermARGBToRGB24_1), // %4
+ "m"(kPermARGBToRGB24_2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
+}
+#endif
+
+#ifdef HAS_ARGBTORAWROW_AVX2
+void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm6 \n"
+ "vmovdqa %4,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRAW), // %3
+ "m"(kPermdRGB24_AVX) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
+ uint32_t dither4,
+ int width) {
+ asm volatile(
+ "movd %3,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "punpcklwd %%xmm6,%%xmm6 \n"
+ "punpckhwd %%xmm7,%%xmm7 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "paddusb %%xmm6,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(dither4) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
+ uint32_t dither4,
+ int width) {
+ asm volatile(
+ "vbroadcastss %3,%%xmm6 \n"
+ "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
+ "vpermq $0xd8,%%ymm6,%%ymm6 \n"
+ "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
+ "vpslld $0x5,%%ymm4,%%ymm4 \n"
+ "vpslld $0xb,%%ymm3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x5,%%ymm0,%%ymm2 \n"
+ "vpsrld $0x3,%%ymm0,%%ymm1 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpand %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(dither4) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTORGB565DITHERROW_AVX2
+
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_RGB24TOARGBROW_SSSE3
+
+/*
+
+ARGBToAR30Row:
+
+Red and Blue
+With the 8 bit value in the upper byte of a short, vpmulhuw by (1024+4)
+produces a 10 bit value in the low 10 bits of each 16 bit value, which is
+what the blue channel needs. Red must end up 4 bits further left, so it is
+multiplied by (1024+4)*16 instead.
+
+Alpha and Green
+Alpha and green are already in the high bits, so vpand can zero out the other
+bits, keeping just the 2 upper bits of alpha and the 8 bits of green. The same
+multiplier, (1024+4), would put the 10 bit green in the low bits. Alpha needs
+a simple multiplier to shift it into position, with a gap of 10 bits above the
+green. Green is 10 bits, so there are 6 bits left in the low short; 4 more are
+needed, so a multiplier of 4 moves the 2 alpha bits into the upper 16 bits,
+and a further shift of 4 is a multiply by 16, giving (4*16) = 64. The result
+is then shifted left 10 to position the A and G channels.
+*/
+
+// Shuffle tables to place the B and R bytes in the upper byte of each 16 bit
+// lane: kShuffleRB30 for ARGB input, kShuffleBR30 for ABGR input.
+static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
+ 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
+
+static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
+ 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
+
+static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
+static const uint32_t kMaskRB10 = 0x3ff003ff;
+static const uint32_t kMaskAG10 = 0xc000ff00;
+static const uint32_t kMulAG10 = 64 * 65536 + 1028;
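+
+// A scalar sketch of the packing described above (illustration only;
+// hypothetical helper, not part of the library). Each 8 bit channel widens to
+// 10 bits as (v << 2) | (v >> 6), which is exactly (v * 1028) >> 8, and alpha
+// keeps only its top 2 bits:
+#if 0
+static uint32_t ScalarARGBToAR30(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
+  uint32_t b10 = ((uint32_t)b << 2) | (b >> 6);
+  uint32_t g10 = ((uint32_t)g << 2) | (g >> 6);
+  uint32_t r10 = ((uint32_t)r << 2) | (r >> 6);
+  return ((uint32_t)(a >> 6) << 30) | (r10 << 20) | (g10 << 10) | b10;
+}
+#endif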
+
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleRB30), // %3
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleBR30), // %3 reversed shuffler
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleRB30), // %3
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_ABGRTOAR30ROW_AVX2
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleBR30), // %3 reversed shuffler
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
+ 10, 9, 8, 11, 14, 13, 12, 15};
+
+static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3,
+ 6, 6, 5, 5, 4, 4, 7, 7};
+static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11,
+ 14, 14, 13, 13, 12, 12, 15, 15};
+
+void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ar64), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm2 \n"
+ "movdqa %4,%%xmm3 \n" LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm0 \n"
+ "pshufb %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleARGBToAB64Lo), // %3
+ "m"(kShuffleARGBToAB64Hi) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+
+void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrlw $8,%%xmm0 \n"
+ "psrlw $8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ar64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm2 \n" LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrlw $8,%%xmm0 \n"
+ "psrlw $8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "pshufb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ab64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleARGBToABGR) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+
+#ifdef HAS_ARGBTOAR64ROW_AVX2
+void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ar64), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_AVX2
+void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm2 \n"
+ "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm3,%%ymm0,%%ymm1 \n"
+ "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleARGBToAB64Lo), // %3
+ "m"(kShuffleARGBToAB64Hi) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif
+
+#ifdef HAS_AR64TOARGBROW_AVX2
+void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpsrlw $8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x40(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ar64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif
+
+#ifdef HAS_AB64TOARGBROW_AVX2
+void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpsrlw $8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x40(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ab64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleARGBToABGR) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif
+
+// clang-format off
+
+// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
+// The round parameter is the register holding the value to add before the shift.
+#define RGBTOY(round) \
+ "1: \n" \
+ "movdqu (%0),%%xmm0 \n" \
+ "movdqu 0x10(%0),%%xmm1 \n" \
+ "movdqu 0x20(%0),%%xmm2 \n" \
+ "movdqu 0x30(%0),%%xmm3 \n" \
+ "psubb %%xmm5,%%xmm0 \n" \
+ "psubb %%xmm5,%%xmm1 \n" \
+ "psubb %%xmm5,%%xmm2 \n" \
+ "psubb %%xmm5,%%xmm3 \n" \
+ "movdqu %%xmm4,%%xmm6 \n" \
+ "pmaddubsw %%xmm0,%%xmm6 \n" \
+ "movdqu %%xmm4,%%xmm0 \n" \
+ "pmaddubsw %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm4,%%xmm1 \n" \
+ "pmaddubsw %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm4,%%xmm2 \n" \
+ "pmaddubsw %%xmm3,%%xmm2 \n" \
+ "lea 0x40(%0),%0 \n" \
+ "phaddw %%xmm0,%%xmm6 \n" \
+ "phaddw %%xmm2,%%xmm1 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "paddw %%" #round ",%%xmm6 \n" \
+ "paddw %%" #round ",%%xmm1 \n" \
+ "psrlw $0x8,%%xmm6 \n" \
+ "psrlw $0x8,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm6 \n" \
+ "movdqu %%xmm6,(%1) \n" \
+ "lea 0x10(%1),%1 \n" \
+ "sub $0x10,%2 \n" \
+ "jg 1b \n"
+
+#define RGBTOY_AVX2(round) \
+ "1: \n" \
+ "vmovdqu (%0),%%ymm0 \n" \
+ "vmovdqu 0x20(%0),%%ymm1 \n" \
+ "vmovdqu 0x40(%0),%%ymm2 \n" \
+ "vmovdqu 0x60(%0),%%ymm3 \n" \
+ "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
+ "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
+ "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
+ "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
+ "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
+ "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+ "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
+ "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
+ "lea 0x80(%0),%0 \n" \
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
+ "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
+ "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
+ "vmovdqu %%ymm0,(%1) \n" \
+ "lea 0x20(%1),%1 \n" \
+ "sub $0x20,%2 \n" \
+ "jg 1b \n" \
+ "vzeroupper \n"
+
+// clang-format on
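+
+// Worked example of the round parameter (illustration only): the kernels
+// subtract 128 from every byte before pmaddubsw, which removes
+// 128 * (sum of coefficients) from each 16 bit sum. For the BT.601 Y kernels
+// the coefficients sum to 220, so round = kAddY16 (0x7e80) restores
+// 128 * 220 = 28160 and leaves 0x1080 = 16 * 256 + 128, i.e. the +16 offset
+// plus +0.5 for rounding. For the full range YJ kernels the coefficients sum
+// to 256, so round = kSub128 (0x8080) restores 32768 and leaves +128 (+0.5).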
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN RGBTOY(xmm7)
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16.
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN RGBTOY(xmm5)
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBTOYJROW_SSSE3
+
+#ifdef HAS_ABGRTOYJROW_SSSE3
+// Convert 16 ABGR pixels (64 bytes) to 16 YJ values.
+// Same as ABGRToYRow but different coefficients, no add 16.
+void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN RGBTOY(xmm5)
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ABGRTOYJROW_SSSE3
+
+#ifdef HAS_RGBATOYJROW_SSSE3
+// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
+// Same as RGBAToYRow but different coefficients, no add 16.
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN RGBTOY(xmm5)
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_RGBATOYJROW_SSSE3
+
+#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \
+ defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+// vpermd to restore dword order after vphaddw + vpackuswb (both mutate lanes).
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
+#endif
+
+#ifdef HAS_ARGBTOYROW_AVX2
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vbroadcastf128 %5,%%ymm7 \n"
+ "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm7) "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16), // %5
+ "m"(kPermdARGBToY_AVX) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ABGRTOYROW_AVX2
+// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vbroadcastf128 %5,%%ymm7 \n"
+ "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm7) "vzeroupper \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16), // %5
+ "m"(kPermdARGBToY_AVX) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm5) "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kSub128), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOYJROW_AVX2
+
+#ifdef HAS_ABGRTOYJROW_AVX2
+// Convert 32 ABGR pixels (128 bytes) to 32 YJ values.
+void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm5) "vzeroupper \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToYJ), // %3
+ "m"(kSub128), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOYJROW_AVX2
+
+#ifdef HAS_RGBATOYJROW_AVX2
+// Convert 32 RGBA pixels (128 bytes) to 32 YJ values.
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
+ ymm5) "vzeroupper \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToYJ), // %3
+ "m"(kSub128), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_RGBATOYJROW_AVX2
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToV), // %5
+ "m"(kARGBToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+#endif // HAS_ARGBTOUVROW_SSSE3
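+
+// A scalar sketch of the UV math used by the kernels above and below
+// (illustration only; hypothetical helper, not part of the library). Source
+// pixels are 2x2 box averaged first; the shift is arithmetic:
+#if 0
+static void ScalarARGBToUV(uint8_t b, uint8_t g, uint8_t r,
+                           uint8_t* u, uint8_t* v) {
+  *u = (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
+  *v = (uint8_t)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
+}
+#endif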
+
+#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \
+ defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2)
+// vpshufb to restore the order of the UV shorts after vphaddw + vpackuswb.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+#endif
+
+#if defined(HAS_ARGBTOUVROW_AVX2)
+void ARGBToUVRow_AVX2(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kARGBToV), // %6
+ "m"(kARGBToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ABGRTOUVROW_AVX2
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kABGRToV), // %6
+ "m"(kABGRToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_AVX2
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kSub128), // %5
+ "m"(kARGBToVJ), // %6
+ "m"(kARGBToUJ), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOUVJROW_AVX2
+
+// TODO(fbarchard): Pass kABGRToVJ / kABGRToUJ as matrix
+#ifdef HAS_ABGRTOUVJROW_AVX2
+void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kSub128), // %5
+ "m"(kABGRToVJ), // %6
+ "m"(kABGRToUJ), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOUVJROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToVJ), // %5
+ "m"(kARGBToUJ), // %6
+ "m"(kSub128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+#endif // HAS_ARGBTOUVJROW_SSSE3
+
+#ifdef HAS_ABGRTOUVJROW_SSSE3
+void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kABGRToVJ), // %5
+ "m"(kABGRToUJ), // %6
+ "m"(kSub128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+#endif // HAS_ABGRTOUVJROW_SSSE3
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqu %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
+}
+#endif // HAS_ARGBTOUV444ROW_SSSE3
+
+void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN RGBTOY(xmm7)
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_bgra)), // %4
+ "m"(kBGRAToV), // %5
+ "m"(kBGRAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN RGBTOY(xmm7)
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN RGBTOY(xmm7)
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kABGRToV), // %5
+ "m"(kABGRToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_rgba)), // %4
+ "m"(kRGBAToV), // %5
+ "m"(kRGBAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
+}
+
+#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
+
+// Read 8 UV from 444
+#define READYUV444 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422 \
+ "movd (%[u_buf]),%%xmm3 \n" \
+ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x4(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 UV from 422 10 bit, upsample to 8 UV
+#define READYUV210 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklwd %%xmm1,%%xmm3 \n" \
+ "psraw $2,%%xmm3 \n" \
+ "packuswb %%xmm3,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
+ "psllw $6,%%xmm4 \n" \
+ "psrlw $4,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
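+// Read 4 UV from 422 10 bit, upsample to 8 UV. With 8 Alpha.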
+#define READYUVA210 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklwd %%xmm1,%%xmm3 \n" \
+ "psraw $2,%%xmm3 \n" \
+ "packuswb %%xmm3,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
+ "psllw $6,%%xmm4 \n" \
+ "psrlw $4,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "movdqu (%[a_buf]),%%xmm5 \n" \
+ "psraw $2,%%xmm5 \n" \
+ "packuswb %%xmm5,%%xmm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n"
+
+// Read 8 UV from 444 10 bit
+#define READYUV410 \
+ "movdqu (%[u_buf]),%%xmm3 \n" \
+ "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "psraw $2,%%xmm3 \n" \
+ "psraw $2,%%xmm2 \n" \
+ "movdqa %%xmm3,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm3 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
+ "psllw $6,%%xmm4 \n" \
+ "psrlw $4,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 444 10 bit. With 8 Alpha.
+#define READYUVA410 \
+ "movdqu (%[u_buf]),%%xmm3 \n" \
+ "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "psraw $2,%%xmm3 \n" \
+ "psraw $2,%%xmm2 \n" \
+ "movdqa %%xmm3,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm3 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
+ "psllw $6,%%xmm4 \n" \
+ "psrlw $4,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "movdqu (%[a_buf]),%%xmm5 \n" \
+ "psraw $2,%%xmm5 \n" \
+ "packuswb %%xmm5,%%xmm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n"
+
+// Read 4 UV from 422 12 bit, upsample to 8 UV
+#define READYUV212 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklwd %%xmm1,%%xmm3 \n" \
+ "psraw $0x4,%%xmm3 \n" \
+ "packuswb %%xmm3,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm2 \n" \
+ "psllw $4,%%xmm4 \n" \
+ "psrlw $8,%%xmm2 \n" \
+ "paddw %%xmm2,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
+#define READYUVA422 \
+ "movd (%[u_buf]),%%xmm3 \n" \
+ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x4(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "movq (%[a_buf]),%%xmm5 \n" \
+ "lea 0x8(%[a_buf]),%[a_buf] \n"
+
+// Read 8 UV from 444. With 8 Alpha.
+#define READYUVA444 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "movq (%[a_buf]),%%xmm5 \n" \
+ "lea 0x8(%[a_buf]),%[a_buf] \n"
+
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12 \
+ "movq (%[uv_buf]),%%xmm3 \n" \
+ "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 VU from NV21, upsample to 8 UV
+#define READNV21 \
+ "movq (%[vu_buf]),%%xmm3 \n" \
+ "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
+ "pshufb %[kShuffleNV21], %%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
+#define READYUY2 \
+ "movdqu (%[yuy2_buf]),%%xmm4 \n" \
+ "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
+ "movdqu (%[yuy2_buf]),%%xmm3 \n" \
+ "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \
+ "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
+
+// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
+#define READUYVY \
+ "movdqu (%[uyvy_buf]),%%xmm4 \n" \
+ "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
+ "movdqu (%[uyvy_buf]),%%xmm3 \n" \
+ "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \
+ "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
+
+// Read 4 UV from P210, upsample to 8 UV
+#define READP210 \
+ "movdqu (%[uv_buf]),%%xmm3 \n" \
+ "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
+ "psrlw $0x8,%%xmm3 \n" \
+ "packuswb %%xmm3,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from P410
+#define READP410 \
+ "movdqu (%[uv_buf]),%%xmm3 \n" \
+ "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \
+ "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
+ "psrlw $0x8,%%xmm3 \n" \
+ "psrlw $0x8,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP(yuvconstants) \
+ "pcmpeqb %%xmm13,%%xmm13 \n" \
+ "movdqa (%[yuvconstants]),%%xmm8 \n" \
+ "pxor %%xmm12,%%xmm12 \n" \
+ "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
+ "psllw $7,%%xmm13 \n" \
+ "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
+ "pshufb %%xmm12,%%xmm13 \n" \
+ "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
+ "movdqa 128(%[yuvconstants]),%%xmm12 \n"
+
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB16(yuvconstants) \
+ "psubb %%xmm13,%%xmm3 \n" \
+ "pmulhuw %%xmm11,%%xmm4 \n" \
+ "movdqa %%xmm8,%%xmm0 \n" \
+ "movdqa %%xmm9,%%xmm1 \n" \
+ "movdqa %%xmm10,%%xmm2 \n" \
+ "paddw %%xmm12,%%xmm4 \n" \
+ "pmaddubsw %%xmm3,%%xmm0 \n" \
+ "pmaddubsw %%xmm3,%%xmm1 \n" \
+ "pmaddubsw %%xmm3,%%xmm2 \n" \
+ "paddsw %%xmm4,%%xmm0 \n" \
+ "paddsw %%xmm4,%%xmm2 \n" \
+ "psubsw %%xmm1,%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm1 \n"
+
+#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
+
+#else
+#define YUVTORGB_SETUP(yuvconstants)
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB16(yuvconstants) \
+ "pcmpeqb %%xmm0,%%xmm0 \n" \
+ "pxor %%xmm1,%%xmm1 \n" \
+ "psllw $7,%%xmm0 \n" \
+ "pshufb %%xmm1,%%xmm0 \n" \
+ "psubb %%xmm0,%%xmm3 \n" \
+ "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \
+ "movdqa (%[yuvconstants]),%%xmm0 \n" \
+ "movdqa 32(%[yuvconstants]),%%xmm1 \n" \
+ "movdqa 64(%[yuvconstants]),%%xmm2 \n" \
+ "pmaddubsw %%xmm3,%%xmm0 \n" \
+ "pmaddubsw %%xmm3,%%xmm1 \n" \
+ "pmaddubsw %%xmm3,%%xmm2 \n" \
+ "movdqa 128(%[yuvconstants]),%%xmm3 \n" \
+ "paddw %%xmm3,%%xmm4 \n" \
+ "paddsw %%xmm4,%%xmm0 \n" \
+ "paddsw %%xmm4,%%xmm2 \n" \
+ "psubsw %%xmm1,%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm1 \n"
+
+#define YUVTORGB_REGS
+#endif // defined(__x86_64__)
+
+#define YUVTORGB(yuvconstants) \
+ YUVTORGB16(yuvconstants) \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
+
+// Store 8 ARGB values.
+#define STOREARGB \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm5,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm0,(%[dst_argb]) \n" \
+ "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
+ "lea 0x20(%[dst_argb]), %[dst_argb] \n"
+
+// Store 8 RGBA values.
+#define STORERGBA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm2,%%xmm1 \n" \
+ "punpcklbw %%xmm0,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5,(%[dst_rgba]) \n" \
+ "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
+ "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
+
+// Store 8 RGB24 values.
+#define STORERGB24 \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm2,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "pshufb %%xmm5,%%xmm0 \n" \
+ "pshufb %%xmm6,%%xmm1 \n" \
+ "palignr $0xc,%%xmm0,%%xmm1 \n" \
+ "movq %%xmm0,(%[dst_rgb24]) \n" \
+ "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" \
+ "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+
+// Store 8 AR30 values.
+#define STOREAR30 \
+ "psraw $0x4,%%xmm0 \n" \
+ "psraw $0x4,%%xmm1 \n" \
+ "psraw $0x4,%%xmm2 \n" \
+ "pminsw %%xmm7,%%xmm0 \n" \
+ "pminsw %%xmm7,%%xmm1 \n" \
+ "pminsw %%xmm7,%%xmm2 \n" \
+ "pmaxsw %%xmm6,%%xmm0 \n" \
+ "pmaxsw %%xmm6,%%xmm1 \n" \
+ "pmaxsw %%xmm6,%%xmm2 \n" \
+ "psllw $0x4,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm3 \n" \
+ "movdqa %%xmm1,%%xmm2 \n" \
+ "punpcklwd %%xmm5,%%xmm1 \n" \
+ "punpckhwd %%xmm5,%%xmm2 \n" \
+ "pslld $0xa,%%xmm1 \n" \
+ "pslld $0xa,%%xmm2 \n" \
+ "por %%xmm1,%%xmm0 \n" \
+ "por %%xmm2,%%xmm3 \n" \
+ "movdqu %%xmm0,(%[dst_ar30]) \n" \
+ "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
+ "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
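+
+// Typical use of a row function like the one above: the caller walks the
+// image a row at a time. A minimal sketch (the buffer and stride names here
+// are hypothetical, not part of this file):
+//
+//   for (int y = 0; y < height; ++y) {
+//     I444ToARGBRow_SSSE3(src_y + y * y_stride, src_u + y * u_stride,
+//                         src_v + y * v_stride, dst_argb + y * dst_stride,
+//                         yuvconstants, width);
+//   }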
+
+#ifdef HAS_I444ALPHATOARGBROW_SSSE3
+void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I444ALPHATOARGBROW_SSSE3
+
+void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STORERGB24
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV444
+ YUVTORGB(yuvconstants)
+ STORERGB24
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// 10 bit YUV to ARGB
+void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV210
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+// 12 bit YUV to ARGB
+void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV212
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+// 10 bit YUV to AR30
+void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV210
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// 12 bit YUV to AR30
+void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV212
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// 10 bit YUV to ARGB
+void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV410
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+#ifdef HAS_I210ALPHATOARGBROW_SSSE3
+// 10 bit YUVA to ARGB
+void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub %[u_buf],%[v_buf] \n"
+
+    LABELALIGN
+    "1: \n"
+    READYUVA210
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl $0x8,%[width] \n"
+    "jg 1b \n"
+  : [y_buf]"+r"(y_buf), // %[y_buf]
+    [u_buf]"+r"(u_buf), // %[u_buf]
+    [v_buf]"+r"(v_buf), // %[v_buf]
+    [a_buf]"+r"(a_buf), // %[a_buf]
+    [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+    [width]"+m"(width) // %[width]
+#else
+    [width]"+rm"(width) // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+#endif // HAS_I210ALPHATOARGBROW_SSSE3
+
+#ifdef HAS_I410ALPHATOARGBROW_SSSE3
+// 10 bit YUVA to ARGB
+void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile(
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA410
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [u_buf] "+r"(u_buf), // %[u_buf]
+ [v_buf] "+r"(v_buf), // %[v_buf]
+ [a_buf] "+r"(a_buf),
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width] "+m"(width) // %[width]
+#else
+ [width] "+rm"(width) // %[width]
+#endif
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+ "xmm5");
+ // clang-format on
+}
+#endif // HAS_I410ALPHATOARGBROW_SSSE3
+
+// 10 bit YUV to AR30
+void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV410
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I422ALPHATOARGBROW_SSSE3
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV12
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV21
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [vu_buf]"+r"(vu_buf), // %[vu_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleNV21]"m"(kShuffleNV21)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUY2
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+ [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READUYVY
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleUYVYY]"m"(kShuffleUYVYY),
+ [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+
+    LABELALIGN
+    "1: \n"
+    READP210
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub $0x8,%[width] \n"
+    "jg 1b \n"
+  : [y_buf]"+r"(y_buf), // %[y_buf]
+    [uv_buf]"+r"(uv_buf), // %[uv_buf]
+    [dst_argb]"+r"(dst_argb), // %[dst_argb]
+    [width]"+rm"(width) // %[width]
+  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+
+void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+
+    LABELALIGN
+    "1: \n"
+    READP410
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub $0x8,%[width] \n"
+    "jg 1b \n"
+  : [y_buf]"+r"(y_buf), // %[y_buf]
+    [uv_buf]"+r"(uv_buf), // %[uv_buf]
+    [dst_argb]"+r"(dst_argb), // %[dst_argb]
+    [width]"+rm"(width) // %[width]
+  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+
+void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READP210
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READP410
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STORERGBA
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3 || HAS_I422TOARGBROW_AVX2
+
+// Read 16 UV from 444
+#define READYUV444_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 \
+ "vmovq (%[u_buf]),%%xmm3 \n" \
+ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+#define READYUV422_AVX512BW \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "vpermq %%zmm3,%%zmm16,%%zmm3 \n" \
+ "vpermq %%zmm1,%%zmm16,%%zmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \
+ "vpermq $0xd8,%%zmm3,%%zmm3 \n" \
+ "vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \
+ "vmovdqu8 (%[y_buf]),%%ymm4 \n" \
+ "vpermq %%zmm4,%%zmm17,%%zmm4 \n" \
+ "vpermq $0xd8,%%zmm4,%%zmm4 \n" \
+ "vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 210, upsample to 16 UV
+// TODO(fbarchard): Consider vshufb to replace pack/unpack
+// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
+#define READYUV210_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpsraw $2,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
+#define READYUVA210_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpsraw $2,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%ymm5 \n" \
+ "vpsraw $2,%%ymm5,%%ymm5 \n" \
+ "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
+ "lea 0x20(%[a_buf]),%[a_buf] \n"
+
+// Read 16 UV from 410
+#define READYUV410_AVX2 \
+ "vmovdqu (%[u_buf]),%%ymm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
+ "lea 0x20(%[u_buf]),%[u_buf] \n" \
+ "vpsraw $2,%%ymm3,%%ymm3 \n" \
+ "vpsraw $2,%%ymm2,%%ymm2 \n" \
+ "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
+ "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 212 12 bit, upsample to 16 UV
+#define READYUV212_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpsraw $0x4,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $4,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $8,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 16 UV from 410. With 16 Alpha.
+#define READYUVA410_AVX2 \
+ "vmovdqu (%[u_buf]),%%ymm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
+ "lea 0x20(%[u_buf]),%[u_buf] \n" \
+ "vpsraw $2,%%ymm3,%%ymm3 \n" \
+ "vpsraw $2,%%ymm2,%%ymm2 \n" \
+ "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
+ "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm2 \n" \
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" \
+ "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%ymm5 \n" \
+ "vpsraw $2,%%ymm5,%%ymm5 \n" \
+ "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
+ "lea 0x20(%[a_buf]),%[a_buf] \n"
+
+// Read 16 UV from 444. With 16 Alpha.
+#define READYUVA444_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%xmm5 \n" \
+ "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n"
+
+// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
+#define READYUVA422_AVX2 \
+ "vmovq (%[u_buf]),%%xmm3 \n" \
+ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%xmm5 \n" \
+ "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n"
+
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 \
+ "vmovdqu (%[uv_buf]),%%xmm3 \n" \
+ "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 VU from NV21, upsample to 16 UV.
+#define READNV21_AVX2 \
+ "vmovdqu (%[vu_buf]),%%xmm3 \n" \
+ "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpshufb %[kShuffleNV21], %%ymm3, %%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from P210, upsample to 16 UV
+#define READP210_AVX2 \
+ "vmovdqu (%[uv_buf]),%%ymm3 \n" \
+ "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 16 UV from P410
+#define READP410_AVX2 \
+ "vmovdqu (%[uv_buf]),%%ymm3 \n" \
+ "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \
+ "lea 0x40(%[uv_buf]),%[uv_buf] \n" \
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+#define READYUY2_AVX2 \
+ "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
+ "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
+ "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \
+ "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \
+ "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
+
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+#define READUYVY_AVX2 \
+ "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
+ "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
+ "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \
+ "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \
+ "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
+
+// TODO(fbarchard): Remove broadcastb
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP_AVX2(yuvconstants) \
+ "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
+ "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
+ "vpsllw $7,%%xmm13,%%xmm13 \n" \
+ "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
+ "vpbroadcastb %%xmm13,%%ymm13 \n" \
+ "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
+ "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
+ "vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
+
+#define YUVTORGB_SETUP_AVX512BW(yuvconstants) \
+ "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
+ "movdqa (%[yuvconstants]),%%xmm8 \n" \
+ "vpbroadcastq %%xmm8, %%zmm8 \n" \
+ "vpsllw $7,%%xmm13,%%xmm13 \n" \
+ "vpbroadcastb %%xmm13,%%zmm13 \n" \
+ "movq 32(%[yuvconstants]),%%xmm9 \n" \
+ "vpbroadcastq %%xmm9,%%zmm9 \n" \
+ "movq 64(%[yuvconstants]),%%xmm10 \n" \
+ "vpbroadcastq %%xmm10,%%zmm10 \n" \
+ "movq 96(%[yuvconstants]),%%xmm11 \n" \
+ "vpbroadcastq %%xmm11,%%zmm11 \n" \
+ "movq 128(%[yuvconstants]),%%xmm12 \n" \
+ "vpbroadcastq %%xmm12,%%zmm12 \n" \
+ "vmovdqu8 (%[quadsplitperm]),%%zmm16 \n" \
+ "vmovdqu8 (%[dquadsplitperm]),%%zmm17 \n" \
+ "vmovdqu8 (%[unperm]),%%zmm18 \n"
+
+#define YUVTORGB16_AVX2(yuvconstants) \
+ "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
+ "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
+ "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \
+ "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \
+ "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \
+ "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \
+ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+ "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
+
+#define YUVTORGB16_AVX512BW(yuvconstants) \
+ "vpsubb %%zmm13,%%zmm3,%%zmm3 \n" \
+ "vpmulhuw %%zmm11,%%zmm4,%%zmm4 \n" \
+ "vpmaddubsw %%zmm3,%%zmm8,%%zmm0 \n" \
+ "vpmaddubsw %%zmm3,%%zmm9,%%zmm1 \n" \
+ "vpmaddubsw %%zmm3,%%zmm10,%%zmm2 \n" \
+ "vpaddw %%zmm4,%%zmm12,%%zmm4 \n" \
+ "vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" \
+ "vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" \
+ "vpaddsw %%zmm4,%%zmm2,%%zmm2 \n"
+
+#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
+#define YUVTORGB_REGS_AVX512BW \
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18",
+
+#else
+
+#define YUVTORGB_SETUP_AVX2(yuvconstants)
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB16_AVX2(yuvconstants) \
+ "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \
+ "vpsllw $7,%%xmm0,%%xmm0 \n" \
+ "vpbroadcastb %%xmm0,%%ymm0 \n" \
+ "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \
+ "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
+ "vmovdqa (%[yuvconstants]),%%ymm0 \n" \
+ "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \
+ "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \
+ "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \
+ "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \
+ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+ "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
+
+#define YUVTORGB_REGS_AVX2
+#endif // defined(__x86_64__)
+
+#define YUVTORGB_AVX2(yuvconstants) \
+ YUVTORGB16_AVX2(yuvconstants) \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+
+#define YUVTORGB_AVX512BW(yuvconstants) \
+ YUVTORGB16_AVX512BW(yuvconstants) \
+ "vpsraw $0x6,%%zmm0,%%zmm0 \n" \
+ "vpsraw $0x6,%%zmm1,%%zmm1 \n" \
+ "vpsraw $0x6,%%zmm2,%%zmm2 \n" \
+ "vpackuswb %%zmm0,%%zmm0,%%zmm0 \n" \
+ "vpackuswb %%zmm1,%%zmm1,%%zmm1 \n" \
+ "vpackuswb %%zmm2,%%zmm2,%%zmm2 \n"
+
+// Store 16 ARGB values.
+#define STOREARGB_AVX2 \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
+ "vmovdqu %%ymm1,(%[dst_argb]) \n" \
+ "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
+ "lea 0x40(%[dst_argb]), %[dst_argb] \n"
+
+// Store 32 ARGB values.
+#define STOREARGB_AVX512BW \
+ "vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" \
+ "vpermq %%zmm0,%%zmm18,%%zmm0 \n" \
+ "vpunpcklbw %%zmm5,%%zmm2,%%zmm2 \n" \
+ "vpermq %%zmm2,%%zmm18,%%zmm2 \n" \
+ "vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \
+ "vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \
+ "vmovdqu8 %%zmm1,(%[dst_argb]) \n" \
+ "vmovdqu8 %%zmm0,0x40(%[dst_argb]) \n" \
+ "lea 0x80(%[dst_argb]), %[dst_argb] \n"
+
+// Store 16 AR30 values.
+#define STOREAR30_AVX2 \
+ "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
+ "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
+ "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
+ "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
+ "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
+ "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
+ "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
+ "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
+ "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
+ "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
+ "vpslld $0xa,%%ymm1,%%ymm1 \n" \
+ "vpslld $0xa,%%ymm2,%%ymm2 \n" \
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
+ "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
+ "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
+ "lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV444_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I444TOARGBROW_AVX2
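+
+// Note that every AVX2 loop in this file ends with vzeroupper: it clears the
+// upper ymm state before returning so that subsequent SSE code does not pay
+// the AVX-to-SSE transition penalty.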
+
+#if defined(HAS_I422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TOARGBROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2};
+static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4};
+static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7};
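+
+// Our reading of these index tables: kSplitQuadWords and
+// kSplitDoubleQuadWords scatter the 128-/256-bit loads across the zmm lanes
+// before the byte unpacks, and kUnpermuteAVX512 ({0,4,1,5,...}) re-interleaves
+// the halves afterwards so pixels land back in source order.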
+
+// 32 pixels
+// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128
+// bytes).
+void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX512BW(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n"
+ "vpbroadcastq %%xmm5,%%zmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX512BW
+ YUVTORGB_AVX512BW(yuvconstants)
+ STOREARGB_AVX512BW
+ "sub $0x20,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [quadsplitperm]"r"(kSplitQuadWords), // %[quadsplitperm]
+ [dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm]
+ [unperm]"r"(kUnpermuteAVX512) // %[unperm]
+ : "memory", "cc", YUVTORGB_REGS_AVX512BW
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TOARGBROW_AVX512BW
+
+#if defined(HAS_I422TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_I422TOAR30ROW_AVX2
+
+#if defined(HAS_I210TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV210_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I210TOARGBROW_AVX2
+
+#if defined(HAS_I212TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV212_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I212TOARGBROW_AVX2
+
+#if defined(HAS_I210TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV210_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_I210TOAR30ROW_AVX2
+
+#if defined(HAS_I212TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV212_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_I212TOAR30ROW_AVX2
+
+#if defined(HAS_I410TOARGBROW_AVX2)
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV410_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I410TOARGBROW_AVX2
+
+#if defined(HAS_I210ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
+void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA210_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I210ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
+void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA410_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I410ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I410TOAR30ROW_AVX2)
+// 16 pixels
+// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV410_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_I410TOAR30ROW_AVX2
+
+#if defined(HAS_I444ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 16 UV values with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA444_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I444ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUVA422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I422ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I422TORGBAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+
+ // Step 3: Weave into RGBA
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%[dst_argb]) \n"
+ "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
+ "lea 0x40(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TORGBAROW_AVX2
+
+#if defined(HAS_NV12TOARGBROW_AVX2)
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV12_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_NV12TOARGBROW_AVX2
+
+#if defined(HAS_NV21TOARGBROW_AVX2)
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READNV21_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [vu_buf]"+r"(vu_buf), // %[vu_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleNV21]"m"(kShuffleNV21)
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_NV21TOARGBROW_AVX2
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUY2_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+ [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_YUY2TOARGBROW_AVX2
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READUYVY_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleUYVYY]"m"(kShuffleUYVYY),
+ [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_UYVYTOARGBROW_AVX2
+
+#if defined(HAS_P210TOARGBROW_AVX2)
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READP210_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_P210TOARGBROW_AVX2
+
+#if defined(HAS_P410TOARGBROW_AVX2)
+// 16 pixels.
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READP410_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_P410TOARGBROW_AVX2
+
+#if defined(HAS_P210TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READP210_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_P210TOAR30ROW_AVX2
+
+#if defined(HAS_P410TOAR30ROW_AVX2)
+// 16 pixels
+// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READP410_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_P410TOAR30ROW_AVX2
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164
+ "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
+ "pslld $0x18,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "paddsw %%xmm3,%%xmm0 \n"
+ "psraw $6, %%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_I400TOARGBROW_SSE2
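+
+// The fixed-point path above computes G = (y - 16) * 1.164 with pmulhuw
+// (yg), paddsw (ygb) and psraw, then replicates G into B and R and ORs in
+// 0xff alpha. A float sketch of the same per-pixel mapping (illustrative
+// helper, not part of libyuv):
+static inline uint8_t I400Luma_Sketch(uint8_t y) {
+  int v = (int)(1.164f * ((int)y - 16) + 0.5f);  // BT.601 limited-range expand
+  if (v < 0) v = 0;
+  if (v > 255) v = 255;
+  return (uint8_t)v;  // stored to B, G and R; A is forced to 0xff
+}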
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164
+ "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ "vmovdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "movdqa %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu -0x10(%0,%2,1),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORROW_SSSE3
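+
+// Both mirror kernels are the SIMD form of this scalar reversal (sketch
+// only, illustrative name; the pshufb table above reverses 16 bytes per
+// iteration):
+static void MirrorRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst[x] = src[width - 1 - x];  // reverse byte order
+  }
+}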
+
+#ifdef HAS_MIRRORROW_AVX2
+void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "movdqa %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu -0x10(%0,%2,2),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_MIRRORUVROW_AVX2
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_AVX2
+
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+ "movdqa %4,%%xmm1 \n"
+ "lea -0x10(%0,%3,2),%0 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n"
+ "movhpd %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorSplitUV) // %4
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
+
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+
+// Shuffle first 5 pixels to last 5 mirrored. First byte zero.
+static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
+ 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
+
+// Shuffle last 5 pixels to first 5 mirrored. Last byte zero.
+static const uvec8 kShuffleMirrorRGB1 = {
+ 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
+
+// Shuffle 5 pixels at a time (15 bytes)
+void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ src_rgb24 += width * 3 - 48;
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // first 5
+ "movdqu 15(%0),%%xmm1 \n" // next 5
+ "movdqu 30(%0),%%xmm2 \n" // next 5
+ "movdqu 32(%0),%%xmm3 \n" // last 1 special
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm3 \n"
+ "lea -0x30(%0),%0 \n"
+ "movdqu %%xmm0,32(%1) \n" // last 5
+ "movdqu %%xmm1,17(%1) \n" // next 5
+ "movdqu %%xmm2,2(%1) \n" // next 5
+ "movlpd %%xmm3,0(%1) \n" // first 1
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorRGB0), // %3
+ "m"(kShuffleMirrorRGB1) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_RGB24MIRRORROW_SSSE3
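+
+// The two shuffle tables reverse five 3-byte pixels per 16-byte load (15
+// bytes used), with a trailing special case for the last pixel. Scalar
+// equivalent, as a sketch with an illustrative name:
+static void RGB24MirrorRow_Sketch(const uint8_t* src_rgb24,
+                                  uint8_t* dst_rgb24,
+                                  int width) {
+  int x;
+  src_rgb24 += (width - 1) * 3;  // start at the last pixel
+  for (x = 0; x < width; ++x) {
+    dst_rgb24[0] = src_rgb24[0];  // B
+    dst_rgb24[1] = src_rgb24[1];  // G
+    dst_rgb24[2] = src_rgb24[2];  // R
+    dst_rgb24 += 3;
+    src_rgb24 -= 3;
+  }
+}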
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+
+void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "lea -0x10(%0,%2,4),%0 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc", "xmm0");
+}
+#endif // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+
+ "vmovdqu %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror_AVX2) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_AVX2
+void SplitUVRow_AVX2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm2,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SPLITUVROW_SSE2
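+
+// Both SplitUV kernels de-interleave UV pairs; the pand keeps the even
+// (U) bytes and the psrlw exposes the odd (V) bytes. Scalar sketch
+// (illustrative name):
+static void SplitUVRow_Sketch(const uint8_t* src_uv,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_u[x] = src_uv[2 * x + 0];
+    dst_v[x] = src_uv[2 * x + 1];
+  }
+}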
+
+#ifdef HAS_DETILEROW_SSE2
+void DetileRow_SSE2(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "lea (%0,%3),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "xmm0");
+}
+#endif // HAS_DETILEROW_SSE2
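+
+// Detiling copies one 16-byte row out of each tile, then hops a whole
+// tile stride to the matching row of the next tile. Scalar sketch
+// (illustrative name; assumes width is a multiple of 16, as the SIMD
+// kernels do):
+static void DetileRow_Sketch(const uint8_t* src,
+                             ptrdiff_t src_tile_stride,
+                             uint8_t* dst,
+                             int width) {
+  int x, i;
+  for (x = 0; x < width; x += 16) {
+    for (i = 0; i < 16; ++i) {
+      dst[x + i] = src[i];  // one row of one 16-byte-wide tile
+    }
+    src += src_tile_stride;  // same row, next tile
+  }
+}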
+
+#ifdef HAS_DETILEROW_16_SSE2
+void DetileRow_16_SSE2(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "xmm0", "xmm1");
+}
+#endif // HAS_DETILEROW_16_SSE2
+
+#ifdef HAS_DETILEROW_16_AVX
+void DetileRow_16_AVX(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea (%0,%3,2),%0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "xmm0");
+}
+#endif // HAS_DETILEROW_16_AVX
+
+#ifdef HAS_DETILETOYUY2_SSE2
+// Read 16 Y and 8 UV, and write 8 YUYV macropixels (16 pixels, 32 bytes).
+void DetileToYUY2_SSE2(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // Load 16 Y
+ "sub $0x10,%3 \n"
+ "lea (%0,%4),%0 \n"
+ "movdqu (%1),%%xmm1 \n" // Load 8 UV
+ "lea (%1,%5),%1 \n"
+ "movdqu %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
+ : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list
+ );
+}
+#endif // HAS_DETILETOYUY2_SSE2
+
+#ifdef HAS_DETILESPLITUVROW_SSSE3
+// TODO(greenjustin): Look into generating these constants instead of loading
+// them since this can cause branch mispredicts for fPIC code on 32-bit
+// machines.
+static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14,
+ 1, 3, 5, 7, 9, 11, 13, 15};
+
+// TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very
+// slow on older SSE2 processors.
+void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqu %4,%%xmm1 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea (%0, %5),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "movhps %%xmm0,(%2) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "m"(kDeinterlaceUV), // %4
+ "r"(src_tile_stride) // %5
+ : "cc", "memory", "xmm0", "xmm1");
+}
+#endif // HAS_DETILESPLITUVROW_SSSE3
+
+#ifdef HAS_MERGEUVROW_AVX512BW
+void MergeUVRow_AVX512BW(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile("sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%0),%%zmm0 \n"
+ "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsllw $0x8,%%zmm1,%%zmm1 \n"
+ "vporq %%zmm0,%%zmm1,%%zmm2 \n"
+ "vmovdqu64 %%zmm2,(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEUVROW_AVX512BW
+
+#ifdef HAS_MERGEUVROW_AVX2
+void MergeUVRow_AVX2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile("sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%0),%%ymm0 \n"
+ "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpsllw $0x8,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+ "vmovdqu %%ymm2,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile("sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEUVROW_SSE2
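+
+// The merge kernels interleave U and V back into UV pairs; punpcklbw and
+// punpckhbw emit 16 pairs per iteration. Scalar sketch (illustrative
+// name):
+static void MergeUVRow_Sketch(const uint8_t* src_u,
+                              const uint8_t* src_v,
+                              uint8_t* dst_uv,
+                              int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_uv[2 * x + 0] = src_u[x];
+    dst_uv[2 * x + 1] = src_v[x];
+  }
+}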
+
+#ifdef HAS_MERGEUVROW_16_AVX2
+void MergeUVRow_16_AVX2(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %4,%%xmm3 \n"
+ "vmovd %5,%%xmm4 \n"
+
+ "sub %0,%1 \n"
+
+ // 8 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd (%0),%%ymm0 \n"
+ "vpmovzxwd 0x00(%0,%1,1),%%ymm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpsllw %%xmm3,%%ymm0,%%ymm0 \n"
+ "vpslld %%xmm4,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+ "vmovdqu %%ymm2,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(16 - depth), // %4
+ "r"(32 - depth) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ // clang-format on
+}
+#endif // HAS_MERGEUVROW_16_AVX2
+
+#ifdef HAS_SPLITUVROW_16_AVX2
+const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13,
+ 2, 3, 6, 7, 10, 11, 14, 15};
+void SplitUVRow_16_AVX2(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {
+ depth = 16 - depth;
+ // clang-format off
+ asm volatile (
+ "vmovd %4,%%xmm3 \n"
+ "vbroadcastf128 %5,%%ymm4 \n"
+ "sub %1,%2 \n"
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+
+ "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n"
+ "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x0,%%ymm1,0x10(%1) \n"
+ "vextractf128 $0x1,%%ymm0,(%1,%2) \n"
+ "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(depth), // %4
+ "m"(kSplitUVShuffle16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ // clang-format on
+}
+#endif // HAS_SPLITUVROW_16_AVX2
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MULTIPLYROW_16_AVX2
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm3");
+ // clang-format on
+}
+#endif // HAS_MULTIPLYROW_16_AVX2
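+
+// For MultiplyRow above, the listed scales are 1 << (16 - depth):
+// vpmullw keeps the low 16 bits of the product, so the multiply is a
+// left shift that moves lsb-justified samples into msb position. Worked
+// sketch (illustrative name):
+static inline uint16_t LsbToMsb_Sketch(uint16_t v, int depth) {
+  int scale = 1 << (16 - depth);  // 9 bits: 128, 10: 64, 12: 16, 16: 1
+  return (uint16_t)(v * scale);   // equivalent to v << (16 - depth)
+}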
+
+// Use scale to convert msb formats to lsb, depending on how many bits there are:
+// 512 = 9 bits
+// 1024 = 10 bits
+// 4096 = 12 bits
+// 65536 = 16 bits
+#ifdef HAS_DIVIDEROW_16_AVX2
+void DivideRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width), // %2
+ "+r"(scale) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm3");
+ // clang-format on
+}
+#endif // HAS_DIVIDEROW_16_AVX2
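+
+// For DivideRow above, scale = 1 << depth: vpmulhuw keeps the high 16
+// bits of the product, so (v * scale) >> 16 is v >> (16 - depth), moving
+// msb-justified samples back down. Sketch for the depths that fit a
+// 16-bit lane (illustrative name):
+static inline uint16_t MsbToLsb_Sketch(uint16_t v, int depth) {
+  uint32_t scale = 1u << depth;  // 9 bits: 512, 10: 1024, 12: 4096
+  return (uint16_t)(((uint32_t)v * scale) >> 16);
+}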
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_SSSE3(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "add $0x20,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+void Convert16To8Row_AVX2(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+#endif // HAS_CONVERT16TO8ROW_AVX2
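+
+// For Convert16To8 above, scale = 1 << (24 - depth), so the pmulhuw
+// result (v * scale) >> 16 equals v >> (depth - 8); packuswb then
+// saturates to bytes. Sketch (illustrative name):
+static inline uint8_t Convert16To8_Sketch(uint16_t v, int depth) {
+  uint32_t scale = 1u << (24 - depth);  // 9: 32768, 10: 16384, 12: 4096, 16: 256
+  uint32_t r = ((uint32_t)v * scale) >> 16;
+  return (uint8_t)(r > 255 ? 255 : r);  // packuswb-style saturation
+}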
+
+// Use scale to convert to lsb formats, depending on how many bits there are:
+// 512 = 9 bits
+// 1024 = 10 bits
+// 4096 = 12 bits
+void Convert8To16Row_SSE2(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+void Convert8To16Row_AVX2(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "add $0x40,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+#endif // HAS_CONVERT8TO16ROW_AVX2
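+
+// Convert8To16 above first doubles each byte into both halves of a word
+// (punpcklbw v,v yields v * 257, mapping 0xff to 0xffff), then pmulhuw
+// by scale = 1 << depth maps full-range 255 to the depth-bit maximum.
+// Sketch (illustrative name):
+static inline uint16_t Convert8To16_Sketch(uint8_t v, int depth) {
+  uint32_t w = (uint32_t)v * 257u;       // replicate byte: 0x00ff -> 0xffff
+  uint32_t scale = 1u << depth;          // 9: 512, 10: 1024, 12: 4096
+  return (uint16_t)((w * scale) >> 16);  // 255 -> (1 << depth) - 1
+}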
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+// Shuffle table for converting RGB to Planar.
+static const uvec8 kSplitRGBShuffle[9] = {
+ {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u},
+ {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u,
+ 128u, 128u},
+ {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u,
+ 7u, 10u, 13u},
+ {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u},
+ {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u,
+ 128u, 128u},
+ {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u,
+ 8u, 11u, 14u},
+ {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u},
+ {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u,
+ 128u, 128u},
+ {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u,
+ 12u, 15u}};
+
+void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb 0(%5), %%xmm0 \n"
+ "pshufb 16(%5), %%xmm1 \n"
+ "pshufb 32(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb 48(%5),%%xmm0 \n"
+ "pshufb 64(%5),%%xmm1 \n"
+ "pshufb 80(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb 96(%5), %%xmm0 \n"
+ "pshufb 112(%5), %%xmm1 \n"
+ "pshufb 128(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "lea 0x10(%3),%3 \n"
+ "lea 0x30(%0),%0 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "r"(&kSplitRGBShuffle[0]) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_SPLITRGBROW_SSSE3
+
+#ifdef HAS_MERGERGBROW_SSSE3
+// Shuffle table for converting Planar to RGB.
+static const uvec8 kMergeRGBShuffle[9] = {
+ {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u,
+ 128u, 5u},
+ {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u,
+ 128u, 128u},
+ {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u,
+ 4u, 128u},
+ {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u,
+ 10u, 128u},
+ {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u,
+ 128u, 10u},
+ {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u,
+ 128u, 128u},
+ {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u,
+ 15u, 128u, 128u},
+ {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u,
+ 128u, 15u, 128u},
+ {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u,
+ 128u, 128u, 15u}};
+
+void MergeRGBRow_SSSE3(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb (%5), %%xmm0 \n"
+ "pshufb 16(%5), %%xmm1 \n"
+ "pshufb 32(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb 48(%5), %%xmm0 \n"
+ "pshufb 64(%5), %%xmm1 \n"
+ "pshufb 80(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,16(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb 96(%5), %%xmm0 \n"
+ "pshufb 112(%5), %%xmm1 \n"
+ "pshufb 128(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,32(%3) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x30(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_rgb), // %3
+ "+r"(width) // %4
+ : "r"(&kMergeRGBShuffle[0]) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGERGBROW_SSSE3
+
+#ifdef HAS_MERGEARGBROW_SSE2
+void MergeARGBRow_SSE2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "movq (%0,%2),%%xmm0 \n" // B
+ "movq (%0),%%xmm1 \n" // R
+ "movq (%0,%1),%%xmm2 \n" // G
+ "punpcklbw %%xmm1,%%xmm0 \n" // BR
+ "movq (%0,%3),%%xmm1 \n" // A
+ "punpcklbw %%xmm1,%%xmm2 \n" // GA
+ "movdqa %%xmm0,%%xmm1 \n" // BR
+ "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi)
+ "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo)
+ "movdqu %%xmm0,(%4) \n"
+ "movdqu %%xmm1,16(%4) \n"
+
+ "lea 8(%0),%0 \n"
+ "lea 32(%4),%4 \n"
+ "sub $0x8,%5 \n"
+ "jg 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEARGBROW_SSE2
+
+#ifdef HAS_MERGEXRGBROW_SSE2
+void MergeXRGBRow_SSE2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+
+ "movq (%2),%%xmm0 \n" // B
+ "movq (%0),%%xmm1 \n" // R
+ "movq (%1),%%xmm2 \n" // G
+ "punpcklbw %%xmm1,%%xmm0 \n" // BR
+ "pcmpeqd %%xmm1,%%xmm1 \n" // A(255)
+ "punpcklbw %%xmm1,%%xmm2 \n" // GA
+ "movdqa %%xmm0,%%xmm1 \n" // BR
+ "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi)
+ "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo)
+ "movdqu %%xmm0,(%3) \n"
+ "movdqu %%xmm1,16(%3) \n"
+
+ "lea 8(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "lea 8(%2),%2 \n"
+ "lea 32(%3),%3 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEXRGBROW_SSE2
+
+#ifdef HAS_MERGEARGBROW_AVX2
+void MergeARGBRow_AVX2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0,%2),%%xmm0 \n" // B
+ "vmovdqu (%0,%1),%%xmm1 \n" // R
+ "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G
+ "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // A
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
+ "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
+ "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%4) \n" // First 8
+ "vmovdqu %%ymm1,32(%4) \n" // Next 8
+
+ "lea 16(%0),%0 \n"
+ "lea 64(%4),%4 \n"
+ "sub $0x10,%5 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEARGBROW_AVX2
+
+#ifdef HAS_MERGEXRGBROW_AVX2
+void MergeXRGBRow_AVX2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%2),%%xmm0 \n" // B
+ "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n" // A(255)
+ "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n" // R
+ "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
+ "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
+ "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3) \n" // First 8
+ "vmovdqu %%ymm1,32(%3) \n" // Next 8
+
+ "lea 16(%0),%0 \n"
+ "lea 16(%1),%1 \n"
+ "lea 16(%2),%2 \n"
+ "lea 64(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEXRGBROW_AVX2
+
+#ifdef HAS_SPLITARGBROW_SSE2
+void SplitARGBRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+ "sub %1,%3 \n"
+ "sub %1,%4 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "movdqu (%0),%%xmm0 \n" // 00-0F
+ "movdqu 16(%0),%%xmm1 \n" // 10-1F
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17
+ "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo)
+ "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B
+ "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo)
+ "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
+ "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
+ "movlps %%xmm0,(%1,%3) \n" // B
+ "movhps %%xmm0,(%1,%2) \n" // G
+ "movlps %%xmm2,(%1) \n" // R
+ "movhps %%xmm2,(%1,%4) \n" // A
+
+ "lea 32(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "sub $0x8,%5 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+ "+rm"(width) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_SPLITARGBROW_SSE2
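+
+// The unpack ladder above is in effect a byte transpose; its net result
+// matches this scalar split (sketch, illustrative name; ARGB is stored
+// B,G,R,A in memory):
+static void SplitARGBRow_Sketch(const uint8_t* src_argb,
+                                uint8_t* dst_r,
+                                uint8_t* dst_g,
+                                uint8_t* dst_b,
+                                uint8_t* dst_a,
+                                int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_b[x] = src_argb[4 * x + 0];
+    dst_g[x] = src_argb[4 * x + 1];
+    dst_r[x] = src_argb[4 * x + 2];
+    dst_a[x] = src_argb[4 * x + 3];
+  }
+}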
+
+#ifdef HAS_SPLITXRGBROW_SSE2
+void SplitXRGBRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+
+ "movdqu (%0),%%xmm0 \n" // 00-0F
+ "movdqu 16(%0),%%xmm1 \n" // 10-1F
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17
+ "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo)
+ "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B
+ "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo)
+ "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
+ "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
+ "movlps %%xmm0,(%3) \n" // B
+ "movhps %%xmm0,(%2) \n" // G
+ "movlps %%xmm2,(%1) \n" // R
+
+ "lea 32(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "lea 8(%2),%2 \n"
+ "lea 8(%3),%3 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_SPLITXRGBROW_SSE2
+
+static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15};
+#ifdef HAS_SPLITARGBROW_SSSE3
+void SplitARGBRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+
+ "movdqa %6,%%xmm3 \n"
+ "sub %1,%2 \n"
+ "sub %1,%3 \n"
+ "sub %1,%4 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "movdqu (%0),%%xmm0 \n" // 00-0F
+ "movdqu 16(%0),%%xmm1 \n" // 10-1F
+ "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo)
+ "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
+ "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
+ "movlps %%xmm0,(%1,%3) \n" // B
+ "movhps %%xmm0,(%1,%2) \n" // G
+ "movlps %%xmm2,(%1) \n" // R
+ "movhps %%xmm2,(%1,%4) \n" // A
+
+ "lea 32(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "subl $0x8,%5 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+#if defined(__i386__)
+ "+m"(width) // %5
+#else
+ "+rm"(width) // %5
+#endif
+ : "m"(kShuffleMaskARGBSplit) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+#endif // HAS_SPLITARGBROW_SSSE3
+
+#ifdef HAS_SPLITXRGBROW_SSSE3
+void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+
+ "movdqa %5,%%xmm3 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "movdqu (%0),%%xmm0 \n" // 00-0F
+ "movdqu 16(%0),%%xmm1 \n" // 10-1F
+ "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo)
+ "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
+ "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
+ "movlps %%xmm0,(%3) \n" // B
+ "movhps %%xmm0,(%2) \n" // G
+ "movlps %%xmm2,(%1) \n" // R
+
+ "lea 32(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "lea 8(%2),%2 \n"
+ "lea 8(%3),%3 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "m"(kShuffleMaskARGBSplit) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+#endif // HAS_SPLITXRGBROW_SSSE3
+
+#ifdef HAS_SPLITARGBROW_AVX2
+static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7};
+void SplitARGBRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+ "sub %1,%3 \n"
+ "sub %1,%4 \n"
+ "vmovdqa %7,%%ymm3 \n"
+ "vbroadcastf128 %6,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 00-0F
+ "vmovdqu 16(%0),%%xmm1 \n" // 10-1F
+ "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
+ "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpermd %%ymm0,%%ymm3,%%ymm0 \n"
+ "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR
+ "vmovdqu %%xmm0,(%1,%3) \n" // B
+ "vextracti128 $1,%%ymm0,(%1) \n" // R
+ "vmovdqu %%xmm2,(%1,%2) \n" // G
+ "vextracti128 $1,%%ymm2,(%1,%4) \n" // A
+ "lea 64(%0),%0 \n"
+ "lea 16(%1),%1 \n"
+ "subl $0x10,%5 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+#if defined(__i386__)
+ "+m"(width) // %5
+#else
+ "+rm"(width) // %5
+#endif
+ : "m"(kShuffleMaskARGBSplit), // %6
+ "m"(kShuffleMaskARGBPermute) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_SPLITARGBROW_AVX2
+
+#ifdef HAS_SPLITXRGBROW_AVX2
+void SplitXRGBRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+
+ "vmovdqa %6,%%ymm3 \n"
+ "vbroadcastf128 %5,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 00-0F
+ "vmovdqu 16(%0),%%xmm1 \n" // 10-1F
+ "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
+ "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpermd %%ymm0,%%ymm3,%%ymm0 \n"
+ "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR
+ "vmovdqu %%xmm0,(%3) \n" // B
+ "vextracti128 $1,%%ymm0,(%1) \n" // R
+ "vmovdqu %%xmm2,(%2) \n" // G
+
+ "lea 64(%0),%0 \n"
+ "lea 16(%1),%1 \n"
+ "lea 16(%2),%2 \n"
+ "lea 16(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "m"(kShuffleMaskARGBSplit), // %5
+ "m"(kShuffleMaskARGBPermute) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_SPLITXRGBROW_AVX2
+
+#ifdef HAS_MERGEXR30ROW_AVX2
+void MergeXR30Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width) {
+ int shift = depth - 10;
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $6,%%ymm6,%%ymm6 \n"
+ "vmovd %5,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu (%0,%1),%%ymm1 \n"
+ "vmovdqu (%0,%2),%%ymm2 \n"
+ "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n"
+ "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n"
+ "vpminuw %%ymm0,%%ymm6,%%ymm0 \n"
+ "vpminuw %%ymm1,%%ymm6,%%ymm1 \n"
+ "vpminuw %%ymm2,%%ymm6,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit
+ "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB
+ "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n"
+ "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG
+ "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit
+ "vpslld $0xa,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine
+ "vpor %%ymm2,%%ymm3,%%ymm3 \n"
+ "vmovdqu %%ymm0,(%3) \n"
+ "vmovdqu %%ymm3,0x20(%3) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+#if defined(__i386__)
+ : "m"(shift) // %5
+#else
+ : "rm"(shift) // %5
+#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_MERGEXR30ROW_AVX2
+
+#ifdef HAS_MERGEAR64ROW_AVX2
+static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7};
+void MergeAR64Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int mask = (1 << depth) - 1;
+ mask = (mask << 16) + mask;
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "vmovdqa %8,%%ymm5 \n"
+ "vmovd %6,%%xmm6 \n"
+ "vbroadcastss %7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vmovdqu (%0,%3),%%ymm3 \n" // A
+ "vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
+ "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpminuw %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
+ "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
+ "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
+ "vpsllw %%xmm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm5,%%ymm3 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low)
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi)
+ "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low)
+ "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi)
+ "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1)
+ "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3)
+ "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2)
+ "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4)
+ "vmovdqu %%ymm3,(%4) \n"
+ "vmovdqu %%ymm2,0x20(%4) \n"
+ "vmovdqu %%ymm4,0x40(%4) \n"
+ "vmovdqu %%ymm1,0x60(%4) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x80(%4),%4 \n"
+ "subl $0x10,%5 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_ar64), // %4
+#if defined(__i386__)
+ "+m"(width) // %5
+#else
+ "+rm"(width) // %5
+#endif
+ : "m"(shift), // %6
+ "m"(mask), // %7
+ "m"(MergeAR64Permute) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
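+
+// For reference: a scalar sketch of the AR64 merge above, assuming the same
+// semantics as the AVX2 path: clamp each channel to depth bits (vpminuw
+// against the broadcast mask), scale up to 16 bits, and store interleaved
+// B,G,R,A. The name is illustrative only.
+static void MergeAR64Row_Sketch_C(const uint16_t* src_r,
+                                  const uint16_t* src_g,
+                                  const uint16_t* src_b,
+                                  const uint16_t* src_a,
+                                  uint16_t* dst_ar64,
+                                  int depth,
+                                  int width) {
+  int shift = 16 - depth;
+  uint16_t max = (uint16_t)((1 << depth) - 1);
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_ar64[0] = (uint16_t)((src_b[x] > max ? max : src_b[x]) << shift);
+    dst_ar64[1] = (uint16_t)((src_g[x] > max ? max : src_g[x]) << shift);
+    dst_ar64[2] = (uint16_t)((src_r[x] > max ? max : src_r[x]) << shift);
+    dst_ar64[3] = (uint16_t)((src_a[x] > max ? max : src_a[x]) << shift);
+    dst_ar64 += 4;
+  }
+}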
+
+#ifdef HAS_MERGEXR64ROW_AVX2
+void MergeXR64Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int mask = (1 << depth) - 1;
+ mask = (mask << 16) + mask;
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "vmovdqa %7,%%ymm5 \n"
+ "vmovd %5,%%xmm6 \n"
+ "vbroadcastss %6,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
+ "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
+ "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
+ "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
+ "vpermd %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff)
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low)
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi)
+ "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low)
+ "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi)
+ "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1)
+ "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3)
+ "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2)
+ "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4)
+ "vmovdqu %%ymm3,(%3) \n"
+ "vmovdqu %%ymm2,0x20(%3) \n"
+ "vmovdqu %%ymm4,0x40(%3) \n"
+ "vmovdqu %%ymm1,0x60(%3) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x80(%3),%3 \n"
+ "subl $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar64), // %3
+ "+r"(width) // %4
+ : "m"(shift), // %5
+ "m"(mask), // %6
+ "m"(MergeAR64Permute) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_MERGEARGB16TO8ROW_AVX2
+static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11,
+ 4, 12, 5, 13, 6, 14, 7, 15};
+void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = depth - 8;
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "vbroadcastf128 %7,%%ymm5 \n"
+ "vmovd %6,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vmovdqu (%0,%3),%%ymm3 \n" // A
+ "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"
+ "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
+ "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
+ "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n"
+ "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar)
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar)
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave)
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave)
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low)
+ "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi)
+ "vmovdqu %%ymm2,(%4) \n"
+ "vmovdqu %%ymm0,0x20(%4) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%4),%4 \n"
+ "subl $0x10,%5 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+#if defined(__i386__)
+ "+m"(width) // %5
+#else
+ "+rm"(width) // %5
+#endif
+ : "m"(shift), // %6
+ "m"(MergeARGB16To8Shuffle) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
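+
+// For reference: a scalar sketch of the 16-to-8 merge above. Each channel is
+// shifted down by (depth - 8) and saturated to a byte (the packuswb), then
+// stored interleaved B,G,R,A. Illustrative name only.
+static void MergeARGB16To8Row_Sketch_C(const uint16_t* src_r,
+                                       const uint16_t* src_g,
+                                       const uint16_t* src_b,
+                                       const uint16_t* src_a,
+                                       uint8_t* dst_argb,
+                                       int depth,
+                                       int width) {
+  int shift = depth - 8;
+  int x;
+  for (x = 0; x < width; ++x) {
+    int b = src_b[x] >> shift;
+    int g = src_g[x] >> shift;
+    int r = src_r[x] >> shift;
+    int a = src_a[x] >> shift;
+    dst_argb[0] = (uint8_t)(b > 255 ? 255 : b);  // packuswb saturates
+    dst_argb[1] = (uint8_t)(g > 255 ? 255 : g);
+    dst_argb[2] = (uint8_t)(r > 255 ? 255 : r);
+    dst_argb[3] = (uint8_t)(a > 255 ? 255 : a);
+    dst_argb += 4;
+  }
+}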
+
+#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
+void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = depth - 8;
+ asm volatile(
+
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "vbroadcastf128 %6,%%ymm5 \n"
+ "vmovd %5,%%xmm6 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff)
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"
+ "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
+ "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar)
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar)
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave)
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave)
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low)
+ "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi)
+ "vmovdqu %%ymm2,(%3) \n"
+ "vmovdqu %%ymm0,0x20(%3) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%3),%3 \n"
+ "subl $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "m"(shift), // %5
+ "m"(MergeARGB16To8Shuffle) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "test $0xf,%0 \n"
+ "jne 2f \n"
+ "test $0xf,%1 \n"
+ "jne 2f \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 9f \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 2b \n"
+
+ LABELALIGN "9: \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_COPYROW_AVX
+
+#ifdef HAS_COPYROW_ERMS
+// Handles any width (multiple of 1); rep movsb copies one byte at a time.
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile(
+
+ "rep movsb \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc");
+}
+#endif // HAS_COPYROW_ERMS
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
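+
+// For reference: the SSE2 code above is a masked merge; in scalar terms it
+// simply replaces the alpha byte of each dst pixel with the src alpha while
+// keeping dst's B,G,R. Illustrative name only.
+static void ARGBCopyAlphaRow_Sketch_C(const uint8_t* src, uint8_t* dst,
+                                      int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst[x * 4 + 3] = src[x * 4 + 3];  // alpha is byte 3 of each ARGB pixel
+  }
+}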
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "vmovdqu 0x20(%0),%%ymm2 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// width in pixels
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0), %%xmm0 \n"
+ "movdqu 0x10(%0), %%xmm1 \n"
+ "lea 0x20(%0), %0 \n"
+ "psrld $0x18, %%xmm0 \n"
+ "psrld $0x18, %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1), %1 \n"
+ "sub $0x8, %2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
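+
+// For reference: a scalar sketch of the extraction above; the psrld/pack
+// sequence is just gathering the high (alpha) byte of each 32-bit pixel.
+// Illustrative name only.
+static void ARGBExtractAlphaRow_Sketch_C(const uint8_t* src_argb,
+                                         uint8_t* dst_a, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_a[x] = src_argb[x * 4 + 3];
+  }
+}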
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+static const uvec8 kShuffleAlphaShort_AVX2 = {
+ 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
+ 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
+
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "vmovdqa %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0), %%ymm0 \n"
+ "vmovdqu 0x20(%0), %%ymm1 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x40(%0), %%ymm2 \n"
+ "vmovdqu 0x60(%0), %%ymm3 \n"
+ "lea 0x80(%0), %0 \n"
+ "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20, %2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+rm"(width) // %2
+ : "m"(kPermdARGBToY_AVX), // %3
+ "m"(kShuffleAlphaShort_AVX2) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm2 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd (%0),%%ymm1 \n"
+ "vpmovzxbd 0x8(%0),%%ymm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
+ size_t width_tmp = (size_t)(width >> 2);
+ const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
+ asm volatile(
+
+ "rep stosl \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile(
+
+ "rep stosb \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v8) // %2
+ : "memory", "cc");
+}
+
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile(
+
+ "rep stosl \n"
+ : "+D"(dst_argb), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(stride_yuy2)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_YUY2TOYROW_SSE2
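+
+// For reference: YUY2 stores two pixels as Y0,U,Y1,V. The UV row kernels
+// above average the chroma of two adjacent source rows with pavgb, which
+// rounds up. A scalar sketch of YUY2ToUVRow (illustrative name only):
+static void YUY2ToUVRow_Sketch_C(const uint8_t* src_yuy2, int stride_yuy2,
+                                 uint8_t* dst_u, uint8_t* dst_v, int width) {
+  const uint8_t* next = src_yuy2 + stride_yuy2;  // row below
+  int x;
+  for (x = 0; x < width; x += 2) {
+    *dst_u++ = (uint8_t)((src_yuy2[1] + next[1] + 1) >> 1);  // pavgb rounds
+    *dst_v++ = (uint8_t)((src_yuy2[3] + next[3] + 1) >> 1);
+    src_yuy2 += 4;
+    next += 4;
+  }
+}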
+
+#ifdef HAS_YUY2TOYROW_AVX2
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%3,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%3,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(stride_yuy2)) // %3
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
+
+// Blend 4 pixels at a time, with a 1 pixel tail loop.
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "m"(kShuffleAlpha) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
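+
+// For reference: unwinding the SSSE3 blend above, the pxor/paddw pair turns
+// src0 alpha into (256 - a), so each color channel computes
+//   dst = sat(src0 + src1 * (256 - a0) / 256)
+// with dst alpha forced to 255. A scalar sketch (illustrative name only):
+static void ARGBBlendRow_Sketch_C(const uint8_t* src0, const uint8_t* src1,
+                                  uint8_t* dst, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t fb = 256 - src0[3];  // back-blend factor from src0 alpha
+    int c;
+    for (c = 0; c < 3; ++c) {
+      uint32_t v = src0[c] + ((src1[c] * fb) >> 8);
+      dst[c] = (uint8_t)(v > 255 ? 255 : v);  // paddusb saturates
+    }
+    dst[3] = 255;
+    src0 += 4;
+    src1 += 4;
+    dst += 4;
+  }
+}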
+
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// Blend 8 pixels at a time.
+// Unsigned version of the math:
+//   dst = ((A2*C2) + (B2*(255-C2)) + 255) / 256
+// Signed version of the math:
+//   dst = (((A2-128)*C2) + ((B2-128)*(255-C2)) + 32768 + 127) / 256
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "movd %%eax,%%xmm7 \n"
+ "pshufd $0x0,%%xmm7,%%xmm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%2),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm0 \n"
+ "movq (%0,%2,1),%%xmm1 \n"
+ "movq (%1,%2,1),%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm7,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%3,%2,1) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(alpha), // %2
+ "+r"(dst), // %3
+ "+rm"(width) // %4
+ ::"memory",
+ "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
+}
+#endif // HAS_BLENDPLANEROW_SSSE3
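+
+// For reference: the unsigned formula quoted above, written out in scalar
+// form (illustrative name only). The SSSE3 path gets the same result via
+// pmaddubsw on sign-biased inputs plus the 0x807f rounding constant.
+static void BlendPlaneRow_Sketch_C(const uint8_t* src0, const uint8_t* src1,
+                                   const uint8_t* alpha, uint8_t* dst,
+                                   int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t a = alpha[x];
+    dst[x] = (uint8_t)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
+  }
+}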
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time.
+// Unsigned version of the math:
+//   dst = ((A2*C2) + (B2*(255-C2)) + 255) / 256
+// Signed version of the math:
+//   dst = (((A2-128)*C2) + ((B2-128)*(255-C2)) + 32768 + 127) / 256
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsllw $0x8,%%ymm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm6 \n"
+ "vbroadcastss %%xmm6,%%ymm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "vmovd %%eax,%%xmm7 \n"
+ "vbroadcastss %%xmm7,%%ymm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
+
+ // 32 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%2),%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0,%2,1),%%ymm1 \n"
+ "vmovdqu (%1,%2,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3,%2,1) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(alpha), // %2
+ "+r"(dst), // %3
+ "+rm"(width) // %4
+ ::"memory",
+ "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_BLENDPLANEROW_AVX2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha.
+static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128,
+ -128, -128, 14, -128, 14, -128,
+ 14, -128, -128, -128};
+
+// Attenuate 4 pixels at a time.
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+ "pxor %%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "punpcklbw %%xmm6,%%xmm7 \n"
+ "sub %0,%1 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqa %%xmm6,%%xmm0 \n"
+ "movdqa %%xmm6,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm2 \n" // a,a,a,0
+ "pshufb %%xmm4,%%xmm3 \n"
+ "pmullw %%xmm2,%%xmm0 \n" // rgb * alpha
+ "pmullw %%xmm3,%%xmm1 \n"
+ "paddw %%xmm7,%%xmm0 \n" // + 255
+ "paddw %%xmm7,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "pand %%xmm5,%%xmm6 \n"
+ "por %%xmm6,%%xmm0 \n"
+ "movdqu %%xmm0,(%0,%1) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kAttenuateShuffle) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
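+
+// For reference: the attenuate kernel above multiplies each color channel by
+// alpha with +255 rounding, keeping alpha itself unchanged. Scalar sketch
+// (illustrative name only):
+static void ARGBAttenuateRow_Sketch_C(const uint8_t* src_argb,
+                                      uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t a = src_argb[3];
+    dst_argb[0] = (uint8_t)((src_argb[0] * a + 255) >> 8);
+    dst_argb[1] = (uint8_t)((src_argb[1] * a + 255) >> 8);
+    dst_argb[2] = (uint8_t)((src_argb[2] * a + 255) >> 8);
+    dst_argb[3] = (uint8_t)a;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}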
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+
+// Shuffle table duplicating alpha.
+static const lvec8 kAttenuateShuffle_AVX2 = {
+ 6, -128, 6, -128, 6, -128, -128, -128, 14, -128, 14,
+ -128, 14, -128, -128, -128, 22, -128, 22, -128, 22, -128,
+ -128, -128, 30, -128, 30, -128, 30, -128, -128, -128};
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmovdqa %3,%%ymm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"
+ "vpunpcklbw %%ymm6,%%ymm7,%%ymm7 \n"
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm5,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm5,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmullw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kAttenuateShuffle_AVX2) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ uintptr_t alpha;
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movzb 0x03(%0),%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x07(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
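+
+// For reference: a scalar sketch of the unattenuate above, assuming (as the
+// pshuflw/pmulhuw sequence suggests) that the low 16 bits of fixed_invtbl8[a]
+// hold an 8.8 fixed-point reciprocal of alpha. Illustrative name only.
+static void ARGBUnattenuateRow_Sketch_C(const uint8_t* src_argb,
+                                        uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t a = src_argb[3];
+    uint32_t ia = fixed_invtbl8[a] & 0xffff;  // assumed ~65536 / a
+    int c;
+    for (c = 0; c < 3; ++c) {
+      uint32_t v = (src_argb[c] * ia) >> 8;
+      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);  // packuswb saturates
+    }
+    dst_argb[3] = (uint8_t)a;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}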
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
+// Unattenuate 8 pixels at a time.
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ uintptr_t alpha;
+ asm volatile(
+ "sub %0,%1 \n"
+ "vbroadcastf128 %5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // replace VPGATHER
+ "movzb 0x03(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x07(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "movzb 0x13(%0),%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x17(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x1b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x1f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
+ "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
+ "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
+ // end of VPGATHER
+
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8), // %4
+ "m"(kUnattenShuffleAlpha_AVX2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psubb %%xmm5,%%xmm0 \n"
+ "psubb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm4,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "movdqu %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm6 \n"
+ "paddw %%xmm5,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm6,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm6,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+ 17, 68, 35, 0, 17, 68, 35, 0};
+
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+ 22, 88, 45, 0, 22, 88, 45, 0};
+
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+ 24, 98, 50, 0, 24, 98, 50, 0};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
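+
+// For reference: the sepia formulas quoted above, in scalar form. Note the
+// constant tables are laid out B,G,R per pixel, so e.g. kARGBToSepiaB
+// computes b*17 + g*68 + r*35. Illustrative name only.
+static void ARGBSepiaRow_Sketch_C(uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
+    int sb = (b * 17 + g * 68 + r * 35) >> 7;
+    int sg = (b * 22 + g * 88 + r * 45) >> 7;
+    int sr = (b * 24 + g * 98 + r * 50) >> 7;
+    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);  // packuswb saturates
+    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
+    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);
+    dst_argb += 4;  // alpha byte is left as-is
+  }
+}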
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with a color matrix.
+// Same as Sepia except matrix is provided.
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ asm volatile(
+ "movdqu (%3),%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm6,0x10(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ asm volatile(
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqu (%0),%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x4,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
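+
+// For reference: the quantize kernel reduces each color channel to a bucket:
+//   dst = ((v * scale) >> 16) * interval_size + interval_offset
+// (pmulhuw supplies the >> 16). Scalar sketch, illustrative name only:
+static void ARGBQuantizeRow_Sketch_C(uint8_t* dst_argb, int scale,
+                                     int interval_size, int interval_offset,
+                                     int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int c;
+    for (c = 0; c < 3; ++c) {
+      int v = dst_argb[c];
+      dst_argb[c] =
+          (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
+    }
+    dst_argb += 4;  // the SSE2 path re-ORs the original alpha back in
+  }
+}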
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ asm volatile(
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_ARGBSHADEROW_SSE2
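+
+// For reference: the shade kernel multiplies every channel (alpha included)
+// by the matching byte of 'value'. Replicating a byte into both halves of a
+// word (v * 0x101), taking pmulhuw's high 16 bits, then >> 8 is a cheap
+// v * s / 255 approximation. Scalar sketch, illustrative name only:
+static void ARGBShadeRow_Sketch_C(const uint8_t* src_argb, uint8_t* dst_argb,
+                                  int width, uint32_t value) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int c;
+    for (c = 0; c < 4; ++c) {
+      uint32_t v = src_argb[c] * 0x101u;                  // punpcklbw self
+      uint32_t s = ((value >> (c * 8)) & 0xff) * 0x101u;  // shade channel
+      dst_argb[c] = (uint8_t)((v * s) >> 24);  // pmulhuw then psrlw $8
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}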
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
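+
+// For reference: the multiply kernel computes, per byte,
+//   dst = (src0 * 0x101 * src1) >> 16
+// which approximates src0 * src1 / 255 (the punpcklbw-with-self replication
+// is the * 0x101). Scalar sketch, illustrative name only:
+static void ARGBMultiplyRow_Sketch_C(const uint8_t* src0, const uint8_t* src1,
+                                     uint8_t* dst, int width) {
+  int i;
+  for (i = 0; i < width * 4; ++i) {
+    dst[i] = (uint8_t)((src0[i] * 0x101 * src1[i]) >> 16);
+  }
+}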
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+      // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu (%1),%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_SSE2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_AVX2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+      // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpaddusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0");
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+      // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsubusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0");
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x2(%0),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "movq 0x02(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x00(%0,%2,1),%%xmm2 \n"
+ "movq 0x02(%0,%2,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%3,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SOBELXROW_SSE2
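+
+// For reference: a scalar sketch of the SobelX kernel above, mirroring the
+// row arithmetic directly (illustrative name only). Like the SSE2 path, it
+// reads two bytes past 'width'.
+static void SobelXRow_Sketch_C(const uint8_t* src_y0, const uint8_t* src_y1,
+                               const uint8_t* src_y2, uint8_t* dst_sobelx,
+                               int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int a = src_y0[i] - src_y0[i + 2];
+    int b = src_y1[i] - src_y1[i + 2];
+    int c = src_y2[i] - src_y2[i + 2];
+    int sobel = a + c + 2 * b;      // the -1 0 1 / -2 0 2 / -1 0 1 taps
+    if (sobel < 0) sobel = -sobel;  // pmaxsw against the negation
+    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);  // packuswb clamps
+  }
+}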
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x1(%0),%%xmm1 \n"
+ "movq 0x01(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x2(%0),%%xmm2 \n"
+ "movq 0x02(%0,%1,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%2,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+      // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "movdqu %%xmm3,0x20(%2) \n"
+ "movdqu %%xmm0,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+      // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+      // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6,(%2) \n"
+ "movdqu %%xmm4,0x10(%2) \n"
+ "movdqu %%xmm7,0x20(%2) \n"
+ "movdqu %%xmm1,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each entry is the sum of all
+// values above and to the left of it, inclusive.
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
+ int width) {
+ asm volatile(
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu 0x10(%2),%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu 0x20(%2),%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu 0x30(%2),%%xmm5 \n"
+ "lea 0x40(%2),%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "movdqu %%xmm4,0x20(%1) \n"
+ "movdqu %%xmm5,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop.
+ LABELALIGN
+ "10: \n"
+ "movd (%0),%%xmm2 \n"
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "lea 0x10(%2),%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
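+
+// For reference: a scalar sketch of the cumulative-sum row (illustrative
+// name only). A running per-channel sum of this row is added to the row
+// above's cumulative sums, yielding a standard integral image.
+static void ComputeCumulativeSumRow_Sketch_C(const uint8_t* row,
+                                             int32_t* cumsum,
+                                             const int32_t* previous_cumsum,
+                                             int width) {
+  int32_t row_sum[4] = {0, 0, 0, 0};
+  int x, c;
+  for (x = 0; x < width; ++x) {
+    for (c = 0; c < 4; ++c) {
+      row_sum[c] += row[x * 4 + c];
+      cumsum[x * 4 + c] = row_sum[c] + previous_cumsum[x * 4 + c];
+    }
+  }
+}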
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+ const int32_t* botleft,
+ int width,
+ int area,
+ uint8_t* dst,
+ int count) {
+ asm volatile(
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
+
+ // 4 pixel small loop.
+ LABELALIGN
+ "4: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
+
+ // 4 pixel loop
+ LABELALIGN
+ "40: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop
+ LABELALIGN
+ "10: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(topleft), // %0
+ "+r"(botleft), // %1
+ "+r"(dst), // %2
+ "+rm"(count) // %3
+ : "r"((intptr_t)(width)), // %4
+ "rm"(area) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
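+
+// Editorial note: a scalar sketch of the box-filter average computed above
+// from the integral image (illustrative; the helper name is hypothetical).
+// The per-channel box sum is topleft[x] - topleft[x+w] - botleft[x] +
+// botleft[x+w]; the SIMD path multiplies by a reciprocal instead of dividing.
+static void CumulativeSumToAverageRow_Sketch(const int32_t* topleft,
+                                             const int32_t* botleft,
+                                             int width,
+                                             int area,
+                                             uint8_t* dst,
+                                             int count) {
+  float ooa = 1.0f / area;
+  int i, c;
+  for (i = 0; i < count; ++i) {
+    for (c = 0; c < 4; ++c) {
+      int32_t sum = topleft[c] - topleft[width * 4 + c] - botleft[c] +
+                    botleft[width * 4 + c];
+      dst[c] = (uint8_t)(sum * ooa);
+    }
+    topleft += 4;
+    botleft += 4;
+    dst += 4;
+  }
+}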
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from a source image, stepping by an affine slope (dudv), to a destination row.
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+ int src_argb_stride,
+ uint8_t* dst_argb,
+ const float* src_dudv,
+ int width) {
+ intptr_t src_argb_stride_temp = src_argb_stride;
+ intptr_t temp;
+ asm volatile(
+ "movq (%3),%%xmm2 \n"
+ "movq 0x08(%3),%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
+
+ // 4 pixel loop
+ LABELALIGN
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0,0x08(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop
+ LABELALIGN
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x04(%2),%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_dudv), // %3
+ "+rm"(width), // %4
+ "=&r"(temp) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
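+
+// Editorial note: scalar sketch of the affine walk above (illustrative).
+// src_dudv holds the starting (u, v) followed by the per-pixel (du, dv);
+// each destination pixel copies the source pixel at the truncated
+// coordinates, which is what cvttps2dq + pmaddwd compute vectorized.
+static void ARGBAffineRow_Sketch(const uint8_t* src_argb,
+                                 int src_argb_stride,
+                                 uint8_t* dst_argb,
+                                 const float* src_dudv,
+                                 int width) {
+  float u = src_dudv[0];
+  float v = src_dudv[1];
+  int i, c;
+  for (i = 0; i < width; ++i) {
+    int x = (int)u;  // truncate toward zero, as cvttps2dq does
+    int y = (int)v;
+    const uint8_t* s = src_argb + y * src_argb_stride + x * 4;
+    for (c = 0; c < 4; ++c) {
+      dst_argb[i * 4 + c] = s[c];
+    }
+    u += src_dudv[2];
+    v += src_dudv[3];
+  }
+}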
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction) {
+ asm volatile(
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm4,%%xmm0 \n"
+ "psubb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm3 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "pmaddubsw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+rm"(width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_INTERPOLATEROW_SSSE3
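+
+// Editorial note: scalar sketch of the blend above (illustrative). The
+// pmaddubsw path with the 0x80 byte bias computes, up to rounding detail,
+// the usual fixed-point lerp between the two rows:
+static void InterpolateRow_Sketch(uint8_t* dst_ptr,
+                                  const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  int width,
+                                  int source_y_fraction) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_ptr[x] = (uint8_t)((src_ptr[x] * (256 - source_y_fraction) +
+                            src_ptr1[x] * source_y_fraction + 128) >>
+                           8);
+  }
+}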
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+void InterpolateRow_AVX2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction) {
+ asm volatile(
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "vbroadcastss %%xmm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm4 \n"
+ "vbroadcastss %%xmm4,%%ymm4 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ "vzeroupper \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
+}
+#endif // HAS_INTERPOLATEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ asm volatile(
+
+ "movdqu (%3),%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_ARGBSHUFFLEROW_SSSE3
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ asm volatile(
+
+ "vbroadcastf128 (%3),%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
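+
+// Editorial note: the shuffler is a 16-byte per-block byte-selection mask.
+// For example, a hypothetical mask (not a constant shipped here) that swaps
+// the R and B channels of each ARGB pixel would be:
+//   static const uvec8 kShuffleSwapRB = {2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u,
+//                                        10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
+//   ARGBShuffleRow_SSSE3(src, dst, (const uint8_t*)&kShuffleSwapRB, width);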
+
+#ifdef HAS_I422TOYUY2ROW_SSE2
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "add $0x10,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOYUY2ROW_SSE2
+
+#ifdef HAS_I422TOUYVYROW_SSE2
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,(%3) \n"
+ "movdqu %%xmm2,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOUYVYROW_SSE2
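+
+// Editorial note: a scalar sketch of the packing above (illustrative).
+// YUY2 stores each pair of luma samples as Y0 U Y1 V; UYVY stores U Y0 V Y1.
+static void I422ToYUY2Row_Sketch(const uint8_t* src_y,
+                                 const uint8_t* src_u,
+                                 const uint8_t* src_v,
+                                 uint8_t* dst_yuy2,
+                                 int width) {
+  int x;
+  for (x = 0; x + 1 < width; x += 2) {
+    dst_yuy2[0] = src_y[0];
+    dst_yuy2[1] = src_u[0];
+    dst_yuy2[2] = src_y[1];
+    dst_yuy2[3] = src_v[0];
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_yuy2 += 4;
+  }
+}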
+
+#ifdef HAS_I422TOYUY2ROW_AVX2
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
+ "vextractf128 $0x0,%%ymm1,(%3) \n"
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOYUY2ROW_AVX2
+
+#ifdef HAS_I422TOUYVYROW_AVX2
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
+ "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
+ "vextractf128 $0x0,%%ymm1,(%3) \n"
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOUYVYROW_AVX2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const float* poly,
+ int width) {
+ asm volatile(
+
+ "pxor %%xmm3,%%xmm3 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps 0x10(%3),%%xmm0 \n"
+ "mulps 0x10(%3),%%xmm4 \n"
+ "addps (%3),%%xmm0 \n"
+ "addps (%3),%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps 0x20(%3),%%xmm2 \n"
+ "mulps 0x20(%3),%%xmm6 \n"
+ "mulps 0x30(%3),%%xmm1 \n"
+ "mulps 0x30(%3),%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const float* poly,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 (%3),%%ymm4 \n"
+ "vbroadcastf128 0x10(%3),%%ymm5 \n"
+ "vbroadcastf128 0x20(%3),%%ymm6 \n"
+ "vbroadcastf128 0x30(%3),%%ymm7 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels
+ "lea 0x8(%0),%0 \n"
+ "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
+ "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
+ "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
+ "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
+ "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
+ "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X *
+ // X
+ "vcvttps2dq %%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
+ "vmovq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
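+
+// Editorial note: scalar sketch of the per-channel cubic evaluated above
+// (illustrative). |poly| holds four floats per coefficient: C0 at +0 bytes,
+// C1 at +16, C2 at +32, C3 at +48, so for a channel value v the result is
+// clamp(C0 + C1*v + C2*v*v + C3*v*v*v, 0, 255).
+static void ARGBPolynomialRow_Sketch(const uint8_t* src_argb,
+                                     uint8_t* dst_argb,
+                                     const float* poly,
+                                     int width) {
+  int i;
+  for (i = 0; i < width * 4; ++i) {
+    float v = (float)src_argb[i];
+    int c = i & 3;  // channel selects the coefficient lane
+    float r = poly[c] + poly[c + 4] * v + poly[c + 8] * v * v +
+              poly[c + 12] * v * v * v;
+    if (r < 0.f) r = 0.f;
+    if (r > 255.f) r = 255.f;
+    dst_argb[i] = (uint8_t)r;
+  }
+}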
+
+#ifdef HAS_HALFFLOATROW_SSE2
+static const float kScaleBias = 1.9259299444e-34f;  // 2^-112
+void HalfFloatRow_SSE2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ scale *= kScaleBias;
+ asm volatile(
+ "movd %3,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n" // 8 shorts
+ "add $0x10,%0 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
+ "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
+ "punpckhwd %%xmm5,%%xmm3 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "psrld $0xd,%%xmm2 \n"
+ "psrld $0xd,%%xmm3 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,-0x10(%0,%1,1) \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(scale) // %3
+ : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_HALFFLOATROW_SSE2
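+
+// Editorial note on the trick above: kScaleBias is 2^-112. Folding it into
+// the scale rebiases the float32 exponent (bias 127) toward the float16
+// bias (15), so after the multiply the raw bits only need a 13-bit right
+// shift (psrld $0xd) to land the exponent and mantissa in their half-float
+// positions; no dedicated conversion instruction is required.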
+
+#ifdef HAS_HALFFLOATROW_AVX2
+void HalfFloatRow_AVX2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ scale *= kScaleBias;
+ asm volatile(
+ "vbroadcastss %3, %%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm2 \n" // 16 shorts
+ "add $0x20,%0 \n"
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vpsrld $0xd,%%ymm3,%%ymm3 \n"
+ "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
+ "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+#if defined(__x86_64__)
+ : "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
+ : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloatRow_F16C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ asm volatile(
+ "vbroadcastss %3, %%ymm4 \n"
+ "sub %0,%1 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd 0x10(%0),%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
+ "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+#if defined(__x86_64__)
+ : "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
+ : "memory", "cc", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd 0x10(%0),%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
+ "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm2", "xmm3");
+}
+#endif // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+void ARGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width) {
+ uintptr_t pixel_temp;
+ asm volatile(
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "movzb -0x1(%0),%1 \n"
+ "movzb 0x03(%3,%1,4),%1 \n"
+ "mov %b1,-0x1(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
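+
+// Editorial note: scalar sketch of the in-place lookup above (illustrative).
+// Each channel indexes its own column of a 256-entry ARGB table.
+static void ARGBColorTableRow_Sketch(uint8_t* dst_argb,
+                                     const uint8_t* table_argb,
+                                     int width) {
+  int x, c;
+  for (x = 0; x < width; ++x) {
+    for (c = 0; c < 4; ++c) {
+      dst_argb[x * 4 + c] = table_argb[dst_argb[x * 4 + c] * 4 + c];
+    }
+  }
+}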
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+void RGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width) {
+ uintptr_t pixel_temp;
+ asm volatile(
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ const uint8_t* luma,
+ uint32_t lumacoeff) {
+ uintptr_t pixel_temp;
+ uintptr_t table_temp;
+ asm volatile(
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%2),%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb (%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,(%3) \n"
+ "movzb 0x1(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x1(%3) \n"
+ "movzb 0x2(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x2(%3) \n"
+ "movzb 0x3(%2),%0 \n"
+ "mov %b0,0x3(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x4(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x4(%3) \n"
+ "movzb 0x5(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x5(%3) \n"
+ "movzb 0x6(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x6(%3) \n"
+ "movzb 0x7(%2),%0 \n"
+ "mov %b0,0x7(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x8(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x8(%3) \n"
+ "movzb 0x9(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x9(%3) \n"
+ "movzb 0xa(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xa(%3) \n"
+ "movzb 0xb(%2),%0 \n"
+ "mov %b0,0xb(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+
+ "movzb 0xc(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xc(%3) \n"
+ "movzb 0xd(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xd(%3) \n"
+ "movzb 0xe(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xe(%3) \n"
+ "movzb 0xf(%2),%0 \n"
+ "mov %b0,0xf(%3) \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x10(%3),%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
+ : "=&d"(pixel_temp), // %0
+ "=&a"(table_temp), // %1
+ "+r"(src_argb), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ : "r"(luma), // %5
+ "rm"(lumacoeff) // %6
+ : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+static const uvec8 kYUV24Shuffle[3] = {
+ {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12},
+ {9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15},
+ {2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}};
+
+// Convert biplanar NV21 to packed YUV24
+// NV21 has VU in memory for chroma.
+// YUV24 is VUY in memory
+void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "movdqa (%4),%%xmm4 \n" // 3 shuffler constants
+ "movdqa 16(%4),%%xmm5 \n"
+ "movdqa 32(%4),%%xmm6 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n" // load 16 Y values
+ "movdqu (%0,%1),%%xmm3 \n" // load 8 VU values
+ "lea 16(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "shufps $0x44,%%xmm3,%%xmm0 \n" // Y 0..7, UV 0..3
+ "shufps $0x99,%%xmm3,%%xmm1 \n" // Y 4..11, UV 2..5
+ "shufps $0xee,%%xmm3,%%xmm2 \n" // Y 8..15, UV 4..7
+ "pshufb %%xmm4, %%xmm0 \n" // weave into YUV24
+ "pshufb %%xmm5, %%xmm1 \n"
+ "pshufb %%xmm6, %%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm1,16(%2) \n"
+ "movdqu %%xmm2,32(%2) \n"
+ "lea 48(%2),%2 \n"
+ "sub $16,%3 \n" // 16 pixels per loop
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ : "r"(&kYUV24Shuffle[0]) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
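+
+// Editorial note: scalar sketch of the repack above (illustrative). Each
+// output triple is V, U, Y, with one VU pair shared by two Y samples.
+static void NV21ToYUV24Row_Sketch(const uint8_t* src_y,
+                                  const uint8_t* src_vu,
+                                  uint8_t* dst_yuv24,
+                                  int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_yuv24[x * 3 + 0] = src_vu[(x & ~1) + 0];  // V
+    dst_yuv24[x * 3 + 1] = src_vu[(x & ~1) + 1];  // U
+    dst_yuv24[x * 3 + 2] = src_y[x];              // Y
+  }
+}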
+
+// Convert biplanar NV21 to packed YUV24
+// NV21 has VU in memory for chroma.
+// YUV24 is VUY in memory
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants
+ "vbroadcastf128 16(%4),%%ymm5 \n"
+ "vbroadcastf128 32(%4),%%ymm6 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm2 \n" // load 32 Y values
+ "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
+ "lea 32(%0),%0 \n"
+ "vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3
+ "vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5
+ "vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n"
+ "vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n"
+ "vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm3,(%2) \n"
+ "vmovdqu %%ymm0,32(%2) \n"
+ "vmovdqu %%ymm1,64(%2) \n"
+ "lea 96(%2),%2 \n"
+ "sub $32,%3 \n" // 32 pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ : "r"(&kYUV24Shuffle[0]) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_NV21ToYUV24ROW_AVX512
+// The following VBMI VEX256 code tests okay with the Intel SDE emulator.
+static const lvec8 kYUV24Perm[3] = {
+ {32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36,
+ 37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43},
+ {10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
+ 48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52},
+ {53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
+ 26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}};
+
+void NV21ToYUV24Row_AVX512(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants
+ "vmovdqa 32(%4),%%ymm5 \n"
+ "vmovdqa 64(%4),%%ymm6 \n" LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm2 \n" // load 32 Y values
+ "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
+ "lea 32(%0),%0 \n"
+ "vmovdqa %%ymm2, %%ymm0 \n"
+ "vmovdqa %%ymm2, %%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm4,%%ymm0 \n"
+ "vpermt2b %%ymm3,%%ymm5,%%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm6,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "vmovdqu %%ymm1,32(%2) \n"
+ "vmovdqu %%ymm2,64(%2) \n"
+ "lea 96(%2),%2 \n"
+ "sub $32,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ : "r"(&kYUV24Perm[0]) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#endif // HAS_NV21ToYUV24ROW_AVX512
+
+#ifdef HAS_SWAPUVROW_SSSE3
+
+// Shuffle table for swapping adjacent byte pairs (UV -> VU).
+static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+
+ "movdqu %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleUVToVU) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_SWAPUVROW_SSSE3
+
+#ifdef HAS_SWAPUVROW_AVX2
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleUVToVU) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_SWAPUVROW_AVX2
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // load 16 U values
+ "movdqu (%1),%%xmm1 \n" // load 16 V values
+ "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
+ "movdqu 0(%1,%5,1),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // half size
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n" // 16 src pixels per loop
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
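+
+// Editorial note: scalar sketch of the 2x2 downsample above (illustrative).
+// Each output UV pair is the rounded average of a 2x2 block of U and of V.
+static void HalfMergeUVRow_Sketch(const uint8_t* src_u,
+                                  int src_stride_u,
+                                  const uint8_t* src_v,
+                                  int src_stride_v,
+                                  uint8_t* dst_uv,
+                                  int width) {
+  int x;
+  for (x = 0; x + 1 < width; x += 2) {
+    dst_uv[0] = (uint8_t)((src_u[0] + src_u[1] + src_u[src_stride_u] +
+                           src_u[src_stride_u + 1] + 2) >> 2);
+    dst_uv[1] = (uint8_t)((src_v[0] + src_v[1] + src_v[src_stride_v] +
+                           src_v[src_stride_v + 1] + 2) >> 2);
+    src_u += 2;
+    src_v += 2;
+    dst_uv += 2;
+  }
+}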
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // load 32 U values
+ "vmovdqu (%1),%%ymm1 \n" // load 32 V values
+ "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
+ "vmovdqu 0(%1,%5,1),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%3 \n" // 32 src pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
+ asm volatile(
+ "pxor %%xmm1,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movd (%0),%%xmm0 \n" // load float
+ "maxss %%xmm1, %%xmm0 \n" // clamp to zero
+ "add 4, %0 \n"
+ "movd %%xmm0, (%1) \n" // store float
+ "add 4, %1 \n"
+ "sub $0x4,%2 \n" // 1 float per loop
+ "jg 1b \n"
+ : "+r"(src_x), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/row_lasx.cc b/source/row_lasx.cc
new file mode 100644
index 00000000..be85022e
--- /dev/null
+++ b/source/row_lasx.cc
@@ -0,0 +1,2304 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ALPHA_VAL (-1)
+
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
+ { \
+ ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]); \
+ vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]); \
+ ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]); \
+ vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]); \
+ yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]); \
+ yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
+ }
+
+// Load 32 YUV422 pixel data
+#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \
+ { \
+ __m256i temp0, temp1; \
+ \
+ DUP2_ARG2(__lasx_xvld, psrc_y, 0, psrc_u, 0, out_y, temp0); \
+ temp1 = __lasx_xvld(psrc_v, 0); \
+ temp0 = __lasx_xvsub_b(temp0, const_0x80); \
+ temp1 = __lasx_xvsub_b(temp1, const_0x80); \
+ temp0 = __lasx_vext2xv_h_b(temp0); \
+ temp1 = __lasx_vext2xv_h_b(temp1); \
+ uv_l = __lasx_xvilvl_h(temp0, temp1); \
+ uv_h = __lasx_xvilvh_h(temp0, temp1); \
+ }
+
+// Load 16 YUV422 pixel data
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \
+ { \
+ __m256i temp0, temp1; \
+ \
+ out_y = __lasx_xvld(psrc_y, 0); \
+ temp0 = __lasx_xvldrepl_d(psrc_u, 0); \
+ temp1 = __lasx_xvldrepl_d(psrc_v, 0); \
+ uv = __lasx_xvilvl_b(temp0, temp1); \
+ uv = __lasx_xvsub_b(uv, const_0x80); \
+ uv = __lasx_vext2xv_h_b(uv); \
+ }
+
+// Convert 32 pixels of YUV420 to RGB.
+#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \
+ g_h, r_l, r_h) \
+ { \
+ __m256i u_l, u_h, v_l, v_h; \
+ __m256i yl_ev, yl_od, yh_ev, yh_od; \
+ __m256i temp0, temp1, temp2, temp3; \
+ \
+ temp0 = __lasx_xvilvl_b(in_y, in_y); \
+ temp1 = __lasx_xvilvh_b(in_y, in_y); \
+ yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \
+ yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \
+ yh_ev = __lasx_xvmulwev_w_hu_h(temp1, yg); \
+ yh_od = __lasx_xvmulwod_w_hu_h(temp1, yg); \
+ DUP4_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \
+ yl_ev, yl_od, yh_ev, yh_od); \
+ yl_ev = __lasx_xvadd_w(yl_ev, yb); \
+ yl_od = __lasx_xvadd_w(yl_od, yb); \
+ yh_ev = __lasx_xvadd_w(yh_ev, yb); \
+ yh_od = __lasx_xvadd_w(yh_od, yb); \
+ v_l = __lasx_xvmulwev_w_h(in_uvl, ubvr); \
+ u_l = __lasx_xvmulwod_w_h(in_uvl, ubvr); \
+ v_h = __lasx_xvmulwev_w_h(in_uvh, ubvr); \
+ u_h = __lasx_xvmulwod_w_h(in_uvh, ubvr); \
+ temp0 = __lasx_xvadd_w(yl_ev, u_l); \
+ temp1 = __lasx_xvadd_w(yl_od, u_l); \
+ temp2 = __lasx_xvadd_w(yh_ev, u_h); \
+ temp3 = __lasx_xvadd_w(yh_od, u_h); \
+ DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ b_l = __lasx_xvpackev_h(temp1, temp0); \
+ b_h = __lasx_xvpackev_h(temp3, temp2); \
+ temp0 = __lasx_xvadd_w(yl_ev, v_l); \
+ temp1 = __lasx_xvadd_w(yl_od, v_l); \
+ temp2 = __lasx_xvadd_w(yh_ev, v_h); \
+ temp3 = __lasx_xvadd_w(yh_od, v_h); \
+ DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ r_l = __lasx_xvpackev_h(temp1, temp0); \
+ r_h = __lasx_xvpackev_h(temp3, temp2); \
+ DUP2_ARG2(__lasx_xvdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \
+ temp0 = __lasx_xvsub_w(yl_ev, u_l); \
+ temp1 = __lasx_xvsub_w(yl_od, u_l); \
+ temp2 = __lasx_xvsub_w(yh_ev, u_h); \
+ temp3 = __lasx_xvsub_w(yh_od, u_h); \
+ DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ g_l = __lasx_xvpackev_h(temp1, temp0); \
+ g_h = __lasx_xvpackev_h(temp3, temp2); \
+ }
+
+// Convert 16 pixels of YUV420 to RGB.
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
+ { \
+ __m256i u_l, v_l, yl_ev, yl_od; \
+ __m256i temp0, temp1; \
+ \
+ in_y = __lasx_xvpermi_d(in_y, 0xD8); \
+ temp0 = __lasx_xvilvl_b(in_y, in_y); \
+ yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \
+ yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \
+ DUP2_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yl_ev, yl_od); \
+ yl_ev = __lasx_xvadd_w(yl_ev, yb); \
+ yl_od = __lasx_xvadd_w(yl_od, yb); \
+ v_l = __lasx_xvmulwev_w_h(in_uv, ubvr); \
+ u_l = __lasx_xvmulwod_w_h(in_uv, ubvr); \
+ temp0 = __lasx_xvadd_w(yl_ev, u_l); \
+ temp1 = __lasx_xvadd_w(yl_od, u_l); \
+ DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
+ DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
+ out_b = __lasx_xvpackev_h(temp1, temp0); \
+ temp0 = __lasx_xvadd_w(yl_ev, v_l); \
+ temp1 = __lasx_xvadd_w(yl_od, v_l); \
+ DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
+ DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
+ out_r = __lasx_xvpackev_h(temp1, temp0); \
+ u_l = __lasx_xvdp2_w_h(in_uv, ugvg); \
+ temp0 = __lasx_xvsub_w(yl_ev, u_l); \
+ temp1 = __lasx_xvsub_w(yl_od, u_l); \
+ DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
+ DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
+ out_g = __lasx_xvpackev_h(temp1, temp0); \
+ }
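+
+// Editorial note: per pixel, the two macros above implement this fixed-point
+// form (illustrative; u and v are centered on 0 by the 0x80 subtraction in
+// the loaders, and y is widened to y * 0x0101 by the self-interleave):
+//   y1 = ((y * 0x0101 * yg) >> 16) + yb
+//   B  = clip255((y1 + u * ub) >> 6)
+//   G  = clip255((y1 - (u * ug + v * vg)) >> 6)
+//   R  = clip255((y1 + v * vr) >> 6)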
+
+// Pack and Store 32 ARGB values.
+#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \
+ { \
+ __m256i temp0, temp1, temp2, temp3; \
+ \
+ temp0 = __lasx_xvpackev_b(g_l, b_l); \
+ temp1 = __lasx_xvpackev_b(a_l, r_l); \
+ temp2 = __lasx_xvpackev_b(g_h, b_h); \
+ temp3 = __lasx_xvpackev_b(a_h, r_h); \
+ r_l = __lasx_xvilvl_h(temp1, temp0); \
+ r_h = __lasx_xvilvh_h(temp1, temp0); \
+ g_l = __lasx_xvilvl_h(temp3, temp2); \
+ g_h = __lasx_xvilvh_h(temp3, temp2); \
+ temp0 = __lasx_xvpermi_q(r_h, r_l, 0x20); \
+ temp1 = __lasx_xvpermi_q(g_h, g_l, 0x20); \
+ temp2 = __lasx_xvpermi_q(r_h, r_l, 0x31); \
+ temp3 = __lasx_xvpermi_q(g_h, g_l, 0x31); \
+ __lasx_xvst(temp0, pdst_argb, 0); \
+ __lasx_xvst(temp1, pdst_argb, 32); \
+ __lasx_xvst(temp2, pdst_argb, 64); \
+ __lasx_xvst(temp3, pdst_argb, 96); \
+ pdst_argb += 128; \
+ }
+
+// Pack and Store 16 ARGB values.
+#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
+ { \
+ __m256i temp0, temp1, temp2, temp3; \
+ \
+ temp0 = __lasx_xvpackev_b(in_g, in_b); \
+ temp1 = __lasx_xvpackev_b(in_a, in_r); \
+ temp2 = __lasx_xvilvl_h(temp1, temp0); \
+ temp3 = __lasx_xvilvh_h(temp1, temp0); \
+ temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20); \
+ temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31); \
+ __lasx_xvst(temp0, pdst_argb, 0); \
+ __lasx_xvst(temp1, pdst_argb, 32); \
+ pdst_argb += 64; \
+ }
+
+#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \
+ { \
+ __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
+ _tmp0 = __lasx_xvaddwev_h_bu(_tmpb, _nexb); \
+ _tmp1 = __lasx_xvaddwod_h_bu(_tmpb, _nexb); \
+ _tmp2 = __lasx_xvaddwev_h_bu(_tmpg, _nexg); \
+ _tmp3 = __lasx_xvaddwod_h_bu(_tmpg, _nexg); \
+ _reg0 = __lasx_xvaddwev_h_bu(_tmpr, _nexr); \
+ _reg1 = __lasx_xvaddwod_h_bu(_tmpr, _nexr); \
+ _tmpb = __lasx_xvavgr_hu(_tmp0, _tmp1); \
+ _tmpg = __lasx_xvavgr_hu(_tmp2, _tmp3); \
+ _tmpr = __lasx_xvavgr_hu(_reg0, _reg1); \
+ _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb); \
+ _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr); \
+ _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg); \
+ _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg); \
+ _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr); \
+ _reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb); \
+ }
+
+void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
+ int x;
+ int len = width / 64;
+ __m256i src0, src1;
+ __m256i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607,
+ 0x08090A0B0C0D0E0F, 0x0001020304050607};
+ src += width - 64;
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1);
+ DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
+ src1);
+ src0 = __lasx_xvpermi_q(src0, src0, 0x01);
+ src1 = __lasx_xvpermi_q(src1, src1, 0x01);
+ __lasx_xvst(src1, dst, 0);
+ __lasx_xvst(src0, dst, 32);
+ dst += 64;
+ src -= 64;
+ }
+}
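+
+// Editorial note: these LASX rows, like the other SIMD paths in this
+// library, generally process full vectors only (I422AlphaToARGBRow_LASX
+// below being a partial exception); odd-width remainders are expected to
+// go through the _Any wrappers in row_any.cc.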
+
+void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ int len = width / 16;
+ __m256i src, dst;
+ __m256i shuffler = {0x0004000500060007, 0x0000000100020003,
+ 0x0004000500060007, 0x0000000100020003};
+
+ src_uv += (width - 16) << 1;
+ for (x = 0; x < len; x++) {
+ src = __lasx_xvld(src_uv, 0);
+ dst = __lasx_xvshuf_h(shuffler, src, src);
+ dst = __lasx_xvpermi_q(dst, dst, 0x01);
+ __lasx_xvst(dst, dst_uv, 0);
+ src_uv -= 32;
+ dst_uv += 32;
+ }
+}
+
+void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1;
+ __m256i dst0, dst1;
+ __m256i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504,
+ 0x0B0A09080F0E0D0C, 0x0302010007060504};
+ src += (width * 4) - 64;
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1);
+ DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
+ src1);
+ dst1 = __lasx_xvpermi_q(src0, src0, 0x01);
+ dst0 = __lasx_xvpermi_q(src1, src1, 0x01);
+ __lasx_xvst(dst0, dst, 0);
+ __lasx_xvst(dst1, dst, 32);
+ dst += 64;
+ src -= 64;
+ }
+}
+
+void I422ToYUY2Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src_u0, src_v0, src_y0, vec_uv0;
+ __m256i vec_yuy2_0, vec_yuy2_1;
+ __m256i dst_yuy2_0, dst_yuy2_1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0);
+ src_y0 = __lasx_xvld(src_y, 0);
+ src_u0 = __lasx_xvpermi_d(src_u0, 0xD8);
+ src_v0 = __lasx_xvpermi_d(src_v0, 0xD8);
+ vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0);
+ vec_yuy2_0 = __lasx_xvilvl_b(vec_uv0, src_y0);
+ vec_yuy2_1 = __lasx_xvilvh_b(vec_uv0, src_y0);
+ dst_yuy2_0 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x20);
+ dst_yuy2_1 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x31);
+ __lasx_xvst(dst_yuy2_0, dst_yuy2, 0);
+ __lasx_xvst(dst_yuy2_1, dst_yuy2, 32);
+ src_u += 16;
+ src_v += 16;
+ src_y += 32;
+ dst_yuy2 += 64;
+ }
+}
+
+void I422ToUYVYRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src_u0, src_v0, src_y0, vec_uv0;
+ __m256i vec_uyvy0, vec_uyvy1;
+ __m256i dst_uyvy0, dst_uyvy1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0);
+ src_y0 = __lasx_xvld(src_y, 0);
+ src_u0 = __lasx_xvpermi_d(src_u0, 0xD8);
+ src_v0 = __lasx_xvpermi_d(src_v0, 0xD8);
+ vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0);
+ vec_uyvy0 = __lasx_xvilvl_b(src_y0, vec_uv0);
+ vec_uyvy1 = __lasx_xvilvh_b(src_y0, vec_uv0);
+ dst_uyvy0 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x20);
+ dst_uyvy1 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x31);
+ __lasx_xvst(dst_uyvy0, dst_uyvy, 0);
+ __lasx_xvst(dst_uyvy1, dst_uyvy, 32);
+ src_u += 16;
+ src_v += 16;
+ src_y += 32;
+ dst_uyvy += 64;
+ }
+}
+
+void I422ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i alpha = __lasx_xvldi(0xFF);
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+void I422ToRGBARow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i alpha = __lasx_xvldi(0xFF);
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb);
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ int res = width & 31;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i zero = __lasx_xvldi(0);
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h;
+
+ y = __lasx_xvld(src_a, 0);
+ a_l = __lasx_xvilvl_b(zero, y);
+ a_h = __lasx_xvilvh_b(zero, y);
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ src_a += 32;
+ }
+ if (res) {
+ __m256i y, uv, r, g, b, a;
+ a = __lasx_xvld(src_a, 0);
+ a = __lasx_vext2xv_hu_bu(a);
+ READYUV422(src_y, src_u, src_v, y, uv);
+ YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r);
+ STOREARGB(a, r, g, b, dst_argb);
+ }
+}
+
+void I422ToRGB24Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int32_t width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i shuffler0 = {0x0504120302100100, 0x0A18090816070614,
+ 0x0504120302100100, 0x0A18090816070614};
+ __m256i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B,
+ 0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i temp0, temp1, temp2, temp3;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ temp0 = __lasx_xvpackev_b(g_l, b_l);
+ temp1 = __lasx_xvpackev_b(g_h, b_h);
+ DUP4_ARG3(__lasx_xvshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1,
+ r_l, temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0,
+ temp1);
+
+ b_l = __lasx_xvilvl_d(temp1, temp2);
+ b_h = __lasx_xvilvh_d(temp3, temp1);
+ temp1 = __lasx_xvpermi_q(b_l, temp0, 0x20);
+ temp2 = __lasx_xvpermi_q(temp0, b_h, 0x30);
+ temp3 = __lasx_xvpermi_q(b_h, b_l, 0x31);
+ __lasx_xvst(temp1, dst_argb, 0);
+ __lasx_xvst(temp2, dst_argb, 32);
+ __lasx_xvst(temp3, dst_argb, 64);
+ dst_argb += 96;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
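+// For reference, RGB565 packs each pixel as (R >> 3) << 11 | (G >> 2) << 5 |
+// (B >> 3): bits 15..11 red, 10..5 green, 4..0 blue.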
+void I422ToRGB565Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i dst_l, dst_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ b_l = __lasx_xvsrli_h(b_l, 3);
+ b_h = __lasx_xvsrli_h(b_h, 3);
+ g_l = __lasx_xvsrli_h(g_l, 2);
+ g_h = __lasx_xvsrli_h(g_h, 2);
+ r_l = __lasx_xvsrli_h(r_l, 3);
+ r_h = __lasx_xvsrli_h(r_h, 3);
+ r_l = __lasx_xvslli_h(r_l, 11);
+ r_h = __lasx_xvslli_h(r_h, 11);
+ g_l = __lasx_xvslli_h(g_l, 5);
+ g_h = __lasx_xvslli_h(g_h, 5);
+ r_l = __lasx_xvor_v(r_l, g_l);
+ r_l = __lasx_xvor_v(r_l, b_l);
+ r_h = __lasx_xvor_v(r_h, g_h);
+ r_h = __lasx_xvor_v(r_h, b_h);
+ dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
+ dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
+ __lasx_xvst(dst_l, dst_rgb565, 0);
+ __lasx_xvst(dst_h, dst_rgb565, 32);
+ dst_rgb565 += 64;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
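+// For reference, ARGB4444 packs each pixel as A << 12 | R << 8 | G << 4 | B
+// with 4 bits per channel; the 0xF000 constant supplies opaque alpha.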
+void I422ToARGB4444Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i alpha = (__m256i)v4u64{0xF000F000F000F000, 0xF000F000F000F000,
+ 0xF000F000F000F000, 0xF000F000F000F000};
+ __m256i mask = {0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0,
+ 0x00F000F000F000F0};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i dst_l, dst_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ b_l = __lasx_xvsrli_h(b_l, 4);
+ b_h = __lasx_xvsrli_h(b_h, 4);
+ r_l = __lasx_xvsrli_h(r_l, 4);
+ r_h = __lasx_xvsrli_h(r_h, 4);
+ g_l = __lasx_xvand_v(g_l, mask);
+ g_h = __lasx_xvand_v(g_h, mask);
+ r_l = __lasx_xvslli_h(r_l, 8);
+ r_h = __lasx_xvslli_h(r_h, 8);
+ r_l = __lasx_xvor_v(r_l, alpha);
+ r_h = __lasx_xvor_v(r_h, alpha);
+ r_l = __lasx_xvor_v(r_l, g_l);
+ r_h = __lasx_xvor_v(r_h, g_h);
+ r_l = __lasx_xvor_v(r_l, b_l);
+ r_h = __lasx_xvor_v(r_h, b_h);
+ dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
+ dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
+ __lasx_xvst(dst_l, dst_argb4444, 0);
+ __lasx_xvst(dst_h, dst_argb4444, 32);
+ dst_argb4444 += 64;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
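+// For reference, ARGB1555 packs each pixel as A << 15 | R << 10 | G << 5 | B
+// with 5 bits per color channel; the 0x8000 constant supplies opaque alpha.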
+void I422ToARGB1555Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i alpha = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+ 0x8000800080008000, 0x8000800080008000};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i dst_l, dst_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ b_l = __lasx_xvsrli_h(b_l, 3);
+ b_h = __lasx_xvsrli_h(b_h, 3);
+ g_l = __lasx_xvsrli_h(g_l, 3);
+ g_h = __lasx_xvsrli_h(g_h, 3);
+ g_l = __lasx_xvslli_h(g_l, 5);
+ g_h = __lasx_xvslli_h(g_h, 5);
+ r_l = __lasx_xvsrli_h(r_l, 3);
+ r_h = __lasx_xvsrli_h(r_h, 3);
+ r_l = __lasx_xvslli_h(r_l, 10);
+ r_h = __lasx_xvslli_h(r_h, 10);
+ r_l = __lasx_xvor_v(r_l, alpha);
+ r_h = __lasx_xvor_v(r_h, alpha);
+ r_l = __lasx_xvor_v(r_l, g_l);
+ r_h = __lasx_xvor_v(r_h, g_h);
+ r_l = __lasx_xvor_v(r_l, b_l);
+ r_h = __lasx_xvor_v(r_h, b_h);
+ dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
+ dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
+ __lasx_xvst(dst_l, dst_argb1555, 0);
+ __lasx_xvst(dst_h, dst_argb1555, 32);
+ dst_argb1555 += 64;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1);
+ dst0 = __lasx_xvpickev_b(src1, src0);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_y, 0);
+ src_yuy2 += 64;
+ dst_y += 32;
+ }
+}
+
+void YUY2ToUVRow_LASX(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src_yuy2_next, 0,
+ src_yuy2_next, 32, src0, src1, src2, src3);
+ src0 = __lasx_xvpickod_b(src1, src0);
+ src1 = __lasx_xvpickod_b(src3, src2);
+ tmp0 = __lasx_xvavgr_bu(src1, src0);
+ tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
+ dst0 = __lasx_xvpickev_b(tmp0, tmp0);
+ dst1 = __lasx_xvpickod_b(tmp0, tmp0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_yuy2 += 64;
+ src_yuy2_next += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1);
+ tmp0 = __lasx_xvpickod_b(src1, src0);
+ tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
+ dst0 = __lasx_xvpickev_b(tmp0, tmp0);
+ dst1 = __lasx_xvpickod_b(tmp0, tmp0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_yuy2 += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1);
+ dst0 = __lasx_xvpickod_b(src1, src0);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_y, 0);
+ src_uyvy += 64;
+ dst_y += 32;
+ }
+}
+
+void UYVYToUVRow_LASX(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2, src3, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src_uyvy_next, 0,
+ src_uyvy_next, 32, src0, src1, src2, src3);
+ src0 = __lasx_xvpickev_b(src1, src0);
+ src1 = __lasx_xvpickev_b(src3, src2);
+ tmp0 = __lasx_xvavgr_bu(src1, src0);
+ tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
+ dst0 = __lasx_xvpickev_b(tmp0, tmp0);
+ dst1 = __lasx_xvpickod_b(tmp0, tmp0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_uyvy += 64;
+ src_uyvy_next += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void UYVYToUV422Row_LASX(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
+ dst0 = __lasx_xvpickev_b(tmp0, tmp0);
+ dst1 = __lasx_xvpickod_b(tmp0, tmp0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_uyvy += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void ARGBToUVRow_LASX(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ const uint8_t* src_argb1 = src_argb0 + src_stride_argb;
+
+ __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m256i vec0, vec1, vec2, vec3;
+ __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
+ __m256i const_0x70 = {0x0038003800380038, 0x0038003800380038,
+ 0x0038003800380038, 0x0038003800380038};
+ __m256i const_0x4A = {0x0025002500250025, 0x0025002500250025,
+ 0x0025002500250025, 0x0025002500250025};
+ __m256i const_0x26 = {0x0013001300130013, 0x0013001300130013,
+ 0x0013001300130013, 0x0013001300130013};
+ __m256i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f,
+ 0x002f002f002f002f, 0x002f002f002f002f};
+ __m256i const_0x12 = {0x0009000900090009, 0x0009000900090009,
+ 0x0009000900090009, 0x0009000900090009};
+ __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
+ 0x0000000700000003};
+ __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
+ src_argb0, 96, src0, src1, src2, src3);
+ DUP4_ARG2(__lasx_xvld, src_argb1, 0, src_argb1, 32, src_argb1, 64,
+ src_argb1, 96, src4, src5, src6, src7);
+ vec0 = __lasx_xvaddwev_h_bu(src0, src4);
+ vec1 = __lasx_xvaddwev_h_bu(src1, src5);
+ vec2 = __lasx_xvaddwev_h_bu(src2, src6);
+ vec3 = __lasx_xvaddwev_h_bu(src3, src7);
+ tmp0 = __lasx_xvpickev_h(vec1, vec0);
+ tmp1 = __lasx_xvpickev_h(vec3, vec2);
+ tmp2 = __lasx_xvpickod_h(vec1, vec0);
+ tmp3 = __lasx_xvpickod_h(vec3, vec2);
+ vec0 = __lasx_xvaddwod_h_bu(src0, src4);
+ vec1 = __lasx_xvaddwod_h_bu(src1, src5);
+ vec2 = __lasx_xvaddwod_h_bu(src2, src6);
+ vec3 = __lasx_xvaddwod_h_bu(src3, src7);
+ tmp4 = __lasx_xvpickev_h(vec1, vec0);
+ tmp5 = __lasx_xvpickev_h(vec3, vec2);
+ vec0 = __lasx_xvpickev_h(tmp1, tmp0);
+ vec1 = __lasx_xvpickod_h(tmp1, tmp0);
+ src0 = __lasx_xvavgr_h(vec0, vec1);
+ vec0 = __lasx_xvpickev_h(tmp3, tmp2);
+ vec1 = __lasx_xvpickod_h(tmp3, tmp2);
+ src1 = __lasx_xvavgr_h(vec0, vec1);
+ vec0 = __lasx_xvpickev_h(tmp5, tmp4);
+ vec1 = __lasx_xvpickod_h(tmp5, tmp4);
+ src2 = __lasx_xvavgr_h(vec0, vec1);
+ dst0 = __lasx_xvmadd_h(const_0x8080, src0, const_0x70);
+ dst0 = __lasx_xvmsub_h(dst0, src2, const_0x4A);
+ dst0 = __lasx_xvmsub_h(dst0, src1, const_0x26);
+ dst1 = __lasx_xvmadd_h(const_0x8080, src1, const_0x70);
+ dst1 = __lasx_xvmsub_h(dst1, src2, const_0x5E);
+ dst1 = __lasx_xvmsub_h(dst1, src0, const_0x12);
+ dst0 = __lasx_xvperm_w(dst0, control);
+ dst1 = __lasx_xvperm_w(dst1, control);
+ dst0 = __lasx_xvssrani_b_h(dst0, dst0, 8);
+ dst1 = __lasx_xvssrani_b_h(dst1, dst1, 8);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_argb0 += 128;
+ src_argb1 += 128;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ int len = (width / 32) - 1;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i shuf = {0x0908060504020100, 0x000000000E0D0C0A, 0x0908060504020100,
+ 0x000000000E0D0C0A};
+ __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005,
+ 0x0000000700000003};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
+ 96, src0, src1, src2, src3);
+ tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+ tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+ tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+ tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+ tmp0 = __lasx_xvperm_w(tmp0, control);
+ tmp1 = __lasx_xvperm_w(tmp1, control);
+ tmp2 = __lasx_xvperm_w(tmp2, control);
+ tmp3 = __lasx_xvperm_w(tmp3, control);
+ __lasx_xvst(tmp0, dst_rgb, 0);
+ __lasx_xvst(tmp1, dst_rgb, 24);
+ __lasx_xvst(tmp2, dst_rgb, 48);
+ __lasx_xvst(tmp3, dst_rgb, 72);
+ dst_rgb += 96;
+ src_argb += 128;
+ }
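+ // Each store above writes 32 bytes for 24 valid bytes of RGB24, so the last
+ // 32 pixels are handled here with element stores to avoid writing past the
+ // end of dst_rgb.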
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96,
+ src0, src1, src2, src3);
+ tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+ tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+ tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+ tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+ tmp0 = __lasx_xvperm_w(tmp0, control);
+ tmp1 = __lasx_xvperm_w(tmp1, control);
+ tmp2 = __lasx_xvperm_w(tmp2, control);
+ tmp3 = __lasx_xvperm_w(tmp3, control);
+ __lasx_xvst(tmp0, dst_rgb, 0);
+ __lasx_xvst(tmp1, dst_rgb, 24);
+ __lasx_xvst(tmp2, dst_rgb, 48);
+ dst_rgb += 72;
+ __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
+ __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
+ __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
+}
+
+void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ int len = (width / 32) - 1;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i shuf = {0x090A040506000102, 0x000000000C0D0E08, 0x090A040506000102,
+ 0x000000000C0D0E08};
+ __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005,
+ 0x0000000700000003};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
+ 96, src0, src1, src2, src3);
+ tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+ tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+ tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+ tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+ tmp0 = __lasx_xvperm_w(tmp0, control);
+ tmp1 = __lasx_xvperm_w(tmp1, control);
+ tmp2 = __lasx_xvperm_w(tmp2, control);
+ tmp3 = __lasx_xvperm_w(tmp3, control);
+ __lasx_xvst(tmp0, dst_rgb, 0);
+ __lasx_xvst(tmp1, dst_rgb, 24);
+ __lasx_xvst(tmp2, dst_rgb, 48);
+ __lasx_xvst(tmp3, dst_rgb, 72);
+ dst_rgb += 96;
+ src_argb += 128;
+ }
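+ // As in ARGBToRGB24Row_LASX, the last 32 pixels use element stores so the
+ // overlapping 32-byte stores cannot run past the end of dst_rgb.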
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96,
+ src0, src1, src2, src3);
+ tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+ tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+ tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+ tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+ tmp0 = __lasx_xvperm_w(tmp0, control);
+ tmp1 = __lasx_xvperm_w(tmp1, control);
+ tmp2 = __lasx_xvperm_w(tmp2, control);
+ tmp3 = __lasx_xvperm_w(tmp3, control);
+ __lasx_xvst(tmp0, dst_rgb, 0);
+ __lasx_xvst(tmp1, dst_rgb, 24);
+ __lasx_xvst(tmp2, dst_rgb, 48);
+ dst_rgb += 72;
+ __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
+ __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
+ __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
+}
+
+void ARGBToRGB565Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i zero = __lasx_xvldi(0);
+ __m256i src0, src1, tmp0, tmp1, dst0;
+ __m256i shift = {0x0300030003000300, 0x0300030003000300, 0x0300030003000300,
+ 0x0300030003000300};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmp0 = __lasx_xvsrli_b(tmp0, 3);
+ tmp1 = __lasx_xvpackev_b(zero, tmp1);
+ tmp1 = __lasx_xvsrli_h(tmp1, 2);
+ tmp0 = __lasx_xvsll_b(tmp0, shift);
+ tmp1 = __lasx_xvslli_h(tmp1, 5);
+ dst0 = __lasx_xvor_v(tmp0, tmp1);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_rgb, 0);
+ dst_rgb += 32;
+ src_argb += 64;
+ }
+}
+
+void ARGBToARGB1555Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i zero = __lasx_xvldi(0);
+ __m256i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0;
+ __m256i shift1 = {0x0703070307030703, 0x0703070307030703, 0x0703070307030703,
+ 0x0703070307030703};
+ __m256i shift2 = {0x0200020002000200, 0x0200020002000200, 0x0200020002000200,
+ 0x0200020002000200};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmp0 = __lasx_xvsrli_b(tmp0, 3);
+ tmp1 = __lasx_xvsrl_b(tmp1, shift1);
+ tmp0 = __lasx_xvsll_b(tmp0, shift2);
+ tmp2 = __lasx_xvpackev_b(zero, tmp1);
+ tmp3 = __lasx_xvpackod_b(zero, tmp1);
+ tmp2 = __lasx_xvslli_h(tmp2, 5);
+ tmp3 = __lasx_xvslli_h(tmp3, 15);
+ dst0 = __lasx_xvor_v(tmp0, tmp2);
+ dst0 = __lasx_xvor_v(dst0, tmp3);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_rgb, 0);
+ dst_rgb += 32;
+ src_argb += 64;
+ }
+}
+
+void ARGBToARGB4444Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1, tmp0, tmp1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmp1 = __lasx_xvandi_b(tmp1, 0xF0);
+ tmp0 = __lasx_xvsrli_b(tmp0, 4);
+ dst0 = __lasx_xvor_v(tmp1, tmp0);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_rgb, 0);
+ dst_rgb += 32;
+ src_argb += 64;
+ }
+}
+
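+// Per-pixel chroma (no subsampling):
+//   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
+//   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8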
+void ARGBToUV444Row_LASX(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int32_t width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i reg0, reg1, reg2, reg3, dst0, dst1;
+ __m256i const_112 = __lasx_xvldi(112);
+ __m256i const_74 = __lasx_xvldi(74);
+ __m256i const_38 = __lasx_xvldi(38);
+ __m256i const_94 = __lasx_xvldi(94);
+ __m256i const_18 = __lasx_xvldi(18);
+ __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+ __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
+ 0x0000000700000003};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
+ 96, src0, src1, src2, src3);
+ tmp0 = __lasx_xvpickev_h(src1, src0);
+ tmp1 = __lasx_xvpickod_h(src1, src0);
+ tmp2 = __lasx_xvpickev_h(src3, src2);
+ tmp3 = __lasx_xvpickod_h(src3, src2);
+ reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp0, const_112);
+ reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp2, const_112);
+ reg2 = __lasx_xvmulwod_h_bu(tmp0, const_74);
+ reg3 = __lasx_xvmulwod_h_bu(tmp2, const_74);
+ reg2 = __lasx_xvmaddwev_h_bu(reg2, tmp1, const_38);
+ reg3 = __lasx_xvmaddwev_h_bu(reg3, tmp3, const_38);
+ reg0 = __lasx_xvsub_h(reg0, reg2);
+ reg1 = __lasx_xvsub_h(reg1, reg3);
+ dst0 = __lasx_xvssrani_b_h(reg1, reg0, 8);
+ dst0 = __lasx_xvperm_w(dst0, control);
+ reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp1, const_112);
+ reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp3, const_112);
+ reg2 = __lasx_xvmulwev_h_bu(tmp0, const_18);
+ reg3 = __lasx_xvmulwev_h_bu(tmp2, const_18);
+ reg2 = __lasx_xvmaddwod_h_bu(reg2, tmp0, const_94);
+ reg3 = __lasx_xvmaddwod_h_bu(reg3, tmp2, const_94);
+ reg0 = __lasx_xvsub_h(reg0, reg2);
+ reg1 = __lasx_xvsub_h(reg1, reg3);
+ dst1 = __lasx_xvssrani_b_h(reg1, reg0, 8);
+ dst1 = __lasx_xvperm_w(dst1, control);
+ __lasx_xvst(dst0, dst_u, 0);
+ __lasx_xvst(dst1, dst_v, 0);
+ dst_u += 32;
+ dst_v += 32;
+ src_argb += 128;
+ }
+}
+
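+// Per-channel multiply: interleaving a byte with itself yields c * 257, so the
+// unsigned high-half multiply computes (c * 257 * s) >> 16, a close
+// approximation of c * s / 255.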
+void ARGBMultiplyRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m256i zero = __lasx_xvldi(0);
+ __m256i src0, src1, dst0, dst1;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
+ tmp0 = __lasx_xvilvl_b(src0, src0);
+ tmp1 = __lasx_xvilvh_b(src0, src0);
+ tmp2 = __lasx_xvilvl_b(zero, src1);
+ tmp3 = __lasx_xvilvh_b(zero, src1);
+ dst0 = __lasx_xvmuh_hu(tmp0, tmp2);
+ dst1 = __lasx_xvmuh_hu(tmp1, tmp3);
+ dst0 = __lasx_xvpickev_b(dst1, dst0);
+ __lasx_xvst(dst0, dst_argb, 0);
+ src_argb0 += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBAddRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m256i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
+ dst0 = __lasx_xvsadd_bu(src0, src1);
+ __lasx_xvst(dst0, dst_argb, 0);
+ src_argb0 += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBSubtractRow_LASX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m256i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
+ dst0 = __lasx_xvssub_bu(src0, src1);
+ __lasx_xvst(dst0, dst_argb, 0);
+ src_argb0 += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
+
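+// Premultiply each color channel by alpha: channel and alpha bytes are
+// widened by duplication (x * 257) and the 32-bit products shifted right by
+// 24, approximating c * a / 255.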
+void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1, tmp0, tmp1;
+ __m256i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m256i b, g, r, a, dst0, dst1;
+ __m256i control = {0x0005000100040000, 0x0007000300060002, 0x0005000100040000,
+ 0x0007000300060002};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ b = __lasx_xvpackev_b(tmp0, tmp0);
+ r = __lasx_xvpackod_b(tmp0, tmp0);
+ g = __lasx_xvpackev_b(tmp1, tmp1);
+ a = __lasx_xvpackod_b(tmp1, tmp1);
+ reg0 = __lasx_xvmulwev_w_hu(b, a);
+ reg1 = __lasx_xvmulwod_w_hu(b, a);
+ reg2 = __lasx_xvmulwev_w_hu(r, a);
+ reg3 = __lasx_xvmulwod_w_hu(r, a);
+ reg4 = __lasx_xvmulwev_w_hu(g, a);
+ reg5 = __lasx_xvmulwod_w_hu(g, a);
+ reg0 = __lasx_xvssrani_h_w(reg1, reg0, 24);
+ reg2 = __lasx_xvssrani_h_w(reg3, reg2, 24);
+ reg4 = __lasx_xvssrani_h_w(reg5, reg4, 24);
+ reg0 = __lasx_xvshuf_h(control, reg0, reg0);
+ reg2 = __lasx_xvshuf_h(control, reg2, reg2);
+ reg4 = __lasx_xvshuf_h(control, reg4, reg4);
+ tmp0 = __lasx_xvpackev_b(reg4, reg0);
+ tmp1 = __lasx_xvpackev_b(a, reg2);
+ dst0 = __lasx_xvilvl_h(tmp1, tmp0);
+ dst1 = __lasx_xvilvh_h(tmp1, tmp0);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ dst_argb += 64;
+ src_argb += 64;
+ }
+}
+
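+// Add the replicated 4-byte dither pattern to each channel, clip to 255, then
+// truncate and pack into RGB565.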
+void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1, tmp0, tmp1, dst0;
+ __m256i b, g, r;
+ __m256i zero = __lasx_xvldi(0);
+ __m256i vec_dither = __lasx_xvldrepl_w(&dither4, 0);
+
+ vec_dither = __lasx_xvilvl_b(zero, vec_dither);
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ b = __lasx_xvpackev_b(zero, tmp0);
+ r = __lasx_xvpackod_b(zero, tmp0);
+ g = __lasx_xvpackev_b(zero, tmp1);
+ b = __lasx_xvadd_h(b, vec_dither);
+ g = __lasx_xvadd_h(g, vec_dither);
+ r = __lasx_xvadd_h(r, vec_dither);
+ DUP2_ARG1(__lasx_xvclip255_h, b, g, b, g);
+ r = __lasx_xvclip255_h(r);
+ b = __lasx_xvsrai_h(b, 3);
+ g = __lasx_xvsrai_h(g, 2);
+ r = __lasx_xvsrai_h(r, 3);
+ g = __lasx_xvslli_h(g, 5);
+ r = __lasx_xvslli_h(r, 11);
+ dst0 = __lasx_xvor_v(b, g);
+ dst0 = __lasx_xvor_v(dst0, r);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_rgb, 0);
+ src_argb += 64;
+ dst_rgb += 32;
+ }
+}
+
+void ARGBShuffleRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1, dst0, dst1;
+ __m256i shuf = {0x0404040400000000, 0x0C0C0C0C08080808, 0x0404040400000000,
+ 0x0C0C0C0C08080808};
+ __m256i temp = __lasx_xvldrepl_w(shuffler, 0);
+
+ shuf = __lasx_xvadd_b(shuf, temp);
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ dst0 = __lasx_xvshuf_b(src0, src0, shuf);
+ dst1 = __lasx_xvshuf_b(src1, src1, shuf);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ src_argb += 64;
+ dst_argb += 64;
+ }
+}
+
+void ARGBShadeRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ int x;
+ int len = width / 8;
+ __m256i src0, dst0, tmp0, tmp1;
+ __m256i vec_value = __lasx_xvreplgr2vr_w(value);
+
+ vec_value = __lasx_xvilvl_b(vec_value, vec_value);
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_argb, 0);
+ tmp0 = __lasx_xvilvl_b(src0, src0);
+ tmp1 = __lasx_xvilvh_b(src0, src0);
+ tmp0 = __lasx_xvmuh_hu(tmp0, vec_value);
+ tmp1 = __lasx_xvmuh_hu(tmp1, vec_value);
+ dst0 = __lasx_xvpickod_b(tmp1, tmp0);
+ __lasx_xvst(dst0, dst_argb, 0);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
+
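+// gray = (29 * B + 150 * G + 77 * R + 128) >> 8, written to B, G and R with
+// alpha preserved.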
+void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1, tmp0, tmp1;
+ __m256i reg0, reg1, reg2, dst0, dst1;
+ __m256i const_128 = __lasx_xvldi(0x480);
+ __m256i const_150 = __lasx_xvldi(0x96);
+ __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D,
+ 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ reg0 = __lasx_xvdp2_h_bu(tmp0, const_br);
+ reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150);
+ reg2 = __lasx_xvadd_h(reg0, reg1);
+ tmp0 = __lasx_xvpackod_b(reg2, reg2);
+ tmp1 = __lasx_xvpackod_b(tmp1, reg2);
+ dst0 = __lasx_xvilvl_h(tmp1, tmp0);
+ dst1 = __lasx_xvilvh_h(tmp1, tmp0);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ src_argb += 64;
+ dst_argb += 64;
+ }
+}
+
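+// In-place sepia filter:
+//   B' = (17 * B + 68 * G + 35 * R) >> 7
+//   G' = (22 * B + 88 * G + 45 * R) >> 7
+//   R' = (24 * B + 98 * G + 50 * R) >> 7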
+void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 16;
+ __m256i src0, src1, tmp0, tmp1;
+ __m256i reg0, reg1, spb, spg, spr;
+ __m256i dst0, dst1;
+ __m256i spb_g = __lasx_xvldi(68);
+ __m256i spg_g = __lasx_xvldi(88);
+ __m256i spr_g = __lasx_xvldi(98);
+ __m256i spb_br = {0x2311231123112311, 0x2311231123112311, 0x2311231123112311,
+ 0x2311231123112311};
+ __m256i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16, 0x2D162D162D162D16,
+ 0x2D162D162D162D16};
+ __m256i spr_br = {0x3218321832183218, 0x3218321832183218, 0x3218321832183218,
+ 0x3218321832183218};
+ __m256i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908, 0x1706150413021100,
+ 0x1F0E1D0C1B0A1908};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, dst_argb, 0, dst_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg);
+ spr = __lasx_xvdp2_h_bu(tmp0, spr_br);
+ spb = __lasx_xvmaddwev_h_bu(spb, tmp1, spb_g);
+ spg = __lasx_xvmaddwev_h_bu(spg, tmp1, spg_g);
+ spr = __lasx_xvmaddwev_h_bu(spr, tmp1, spr_g);
+ spb = __lasx_xvsrli_h(spb, 7);
+ spg = __lasx_xvsrli_h(spg, 7);
+ spr = __lasx_xvsrli_h(spr, 7);
+ spg = __lasx_xvsat_hu(spg, 7);
+ spr = __lasx_xvsat_hu(spr, 7);
+ reg0 = __lasx_xvpackev_b(spg, spb);
+ reg1 = __lasx_xvshuf_b(tmp1, spr, shuff);
+ dst0 = __lasx_xvilvl_h(reg1, reg0);
+ dst1 = __lasx_xvilvh_h(reg1, reg0);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ dst_argb += 64;
+ }
+}
+
+void ARGB4444ToARGBRow_LASX(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i reg0, reg1, reg2, reg3;
+ __m256i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_argb4444, 0);
+ src1 = __lasx_xvld(src_argb4444, 32);
+ DUP4_ARG2(__lasx_xvandi_b, src0, 0x0F, src0, 0xF0, src1, 0x0F, src1, 0xF0,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lasx_xvslli_b, tmp0, 4, tmp2, 4, reg0, reg2);
+ DUP2_ARG2(__lasx_xvsrli_b, tmp1, 4, tmp3, 4, reg1, reg3);
+ DUP4_ARG2(__lasx_xvor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lasx_xvilvl_b, tmp1, tmp0, tmp3, tmp2, reg0, reg2);
+ DUP2_ARG2(__lasx_xvilvh_b, tmp1, tmp0, tmp3, tmp2, reg1, reg3);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg1, reg0, 0x31, reg3, reg2,
+ 0x20, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ __lasx_xvst(dst2, dst_argb, 64);
+ __lasx_xvst(dst3, dst_argb, 96);
+ src_argb4444 += 64;
+ dst_argb += 128;
+ }
+}
+
+void ARGB1555ToARGBRow_LASX(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa;
+ __m256i reg0, reg1, reg2, reg3;
+ __m256i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_argb1555, 0);
+ src1 = __lasx_xvld(src_argb1555, 32);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);
+ tmpg = __lasx_xvsrli_b(tmp0, 5);
+ reg0 = __lasx_xvandi_b(tmp1, 0x03);
+ reg0 = __lasx_xvslli_b(reg0, 3);
+ tmpg = __lasx_xvor_v(tmpg, reg0);
+ reg1 = __lasx_xvandi_b(tmp1, 0x7C);
+ tmpr = __lasx_xvsrli_b(reg1, 2);
+ tmpa = __lasx_xvsrli_b(tmp1, 7);
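+ // tmpa now holds 0 or 1 per pixel; negation maps this to 0x00 or 0xFF,
+ // replicating the 1-bit alpha across the whole byte.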
+ tmpa = __lasx_xvneg_b(tmpa);
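+ // Expand the 5-bit channels to 8 bits: (x << 3) | (x >> 2) copies the top
+ // bits into the low bits.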
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvslli_b(tmpg, 3);
+ reg2 = __lasx_xvslli_b(tmpr, 3);
+ tmpb = __lasx_xvsrli_b(tmpb, 2);
+ tmpg = __lasx_xvsrli_b(tmpg, 2);
+ tmpr = __lasx_xvsrli_b(tmpr, 2);
+ tmpb = __lasx_xvor_v(reg0, tmpb);
+ tmpg = __lasx_xvor_v(reg1, tmpg);
+ tmpr = __lasx_xvor_v(reg2, tmpr);
+ DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
+ DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, tmpa, tmpr, reg2, reg3);
+ dst0 = __lasx_xvilvl_h(reg1, reg0);
+ dst1 = __lasx_xvilvh_h(reg1, reg0);
+ dst2 = __lasx_xvilvl_h(reg3, reg2);
+ dst3 = __lasx_xvilvh_h(reg3, reg2);
+ DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2,
+ 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3);
+ __lasx_xvst(reg0, dst_argb, 0);
+ __lasx_xvst(reg1, dst_argb, 32);
+ __lasx_xvst(reg2, dst_argb, 64);
+ __lasx_xvst(reg3, dst_argb, 96);
+ src_argb1555 += 64;
+ dst_argb += 128;
+ }
+}
+
+void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m256i reg0, reg1, reg2, reg3, dst0, dst1, dst2, dst3;
+ __m256i alpha = __lasx_xvldi(0xFF);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_rgb565, 0);
+ src1 = __lasx_xvld(src_rgb565, 32);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);
+ tmpr = __lasx_xvandi_b(tmp1, 0xF8);
+ reg1 = __lasx_xvandi_b(tmp1, 0x07);
+ reg0 = __lasx_xvsrli_b(tmp0, 5);
+ reg1 = __lasx_xvslli_b(reg1, 3);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvsrli_b(tmpb, 2);
+ tmpb = __lasx_xvor_v(reg1, reg0);
+ reg0 = __lasx_xvslli_b(tmpg, 2);
+ reg1 = __lasx_xvsrli_b(tmpg, 4);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ reg0 = __lasx_xvsrli_b(tmpr, 5);
+ tmpr = __lasx_xvor_v(tmpr, reg0);
+ DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
+ dst0 = __lasx_xvilvl_h(reg1, reg0);
+ dst1 = __lasx_xvilvh_h(reg1, reg0);
+ DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
+ dst2 = __lasx_xvilvl_h(reg1, reg0);
+ dst3 = __lasx_xvilvh_h(reg1, reg0);
+ DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2,
+ 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3);
+ __lasx_xvst(reg0, dst_argb, 0);
+ __lasx_xvst(reg1, dst_argb, 32);
+ __lasx_xvst(reg2, dst_argb, 64);
+ __lasx_xvst(reg3, dst_argb, 96);
+ src_rgb565 += 64;
+ dst_argb += 128;
+ }
+}
+
+void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2;
+ __m256i tmp0, tmp1, tmp2;
+ __m256i dst0, dst1, dst2, dst3;
+ __m256i reg0, reg1, reg2, reg3;
+ __m256i alpha = __lasx_xvldi(0xFF);
+ __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C,
+ 0x1B1A191817161514};
+ __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918,
+ 0x0706050403020100};
+ __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504,
+ 0x131211100F0E0D0C};
+ __m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706, 0x1005040310020100,
+ 0x100B0A0910080706};
+
+ for (x = 0; x < len; x++) {
+ reg0 = __lasx_xvld(src_rgb24, 0);
+ reg1 = __lasx_xvld(src_rgb24, 32);
+ reg2 = __lasx_xvld(src_rgb24, 64);
+ src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
+ src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
+ src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0,
+ tmp1);
+ tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
+ DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
+ tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0,
+ 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ __lasx_xvst(dst2, dst_argb, 64);
+ __lasx_xvst(dst3, dst_argb, 96);
+ src_rgb24 += 96;
+ dst_argb += 128;
+ }
+}
+
+void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2;
+ __m256i tmp0, tmp1, tmp2, reg0, reg1, reg2, reg3;
+ __m256i dst0, dst1, dst2, dst3;
+ __m256i alpha = __lasx_xvldi(0xFF);
+ __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C,
+ 0x1B1A191817161514};
+ __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918,
+ 0x0706050403020100};
+ __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504,
+ 0x131211100F0E0D0C};
+ __m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708, 0x1003040510000102,
+ 0x10090A0B10060708};
+
+ for (x = 0; x < len; x++) {
+ reg0 = __lasx_xvld(src_raw, 0);
+ reg1 = __lasx_xvld(src_raw, 32);
+ reg2 = __lasx_xvld(src_raw, 64);
+ src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
+ src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
+ src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0,
+ tmp1);
+ tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
+ DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
+ tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0,
+ 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ __lasx_xvst(dst2, dst_argb, 64);
+ __lasx_xvst(dst3, dst_argb, 96);
+ src_raw += 96;
+ dst_argb += 128;
+ }
+}
+
+void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m256i reg0, reg1, reg2, dst0;
+ __m256i const_66 = __lasx_xvldi(66);
+ __m256i const_129 = __lasx_xvldi(129);
+ __m256i const_25 = __lasx_xvldi(25);
+ __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
+ 0x1080108010801080, 0x1080108010801080};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_argb1555, 0);
+ src1 = __lasx_xvld(src_argb1555, 32);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);
+ tmpg = __lasx_xvsrli_b(tmp0, 5);
+ reg0 = __lasx_xvandi_b(tmp1, 0x03);
+ reg0 = __lasx_xvslli_b(reg0, 3);
+ tmpg = __lasx_xvor_v(tmpg, reg0);
+ reg1 = __lasx_xvandi_b(tmp1, 0x7C);
+ tmpr = __lasx_xvsrli_b(reg1, 2);
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvslli_b(tmpg, 3);
+ reg2 = __lasx_xvslli_b(tmpr, 3);
+ tmpb = __lasx_xvsrli_b(tmpb, 2);
+ tmpg = __lasx_xvsrli_b(tmpg, 2);
+ tmpr = __lasx_xvsrli_b(tmpr, 2);
+ tmpb = __lasx_xvor_v(reg0, tmpb);
+ tmpg = __lasx_xvor_v(reg1, tmpg);
+ tmpr = __lasx_xvor_v(reg2, tmpr);
+ reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25);
+ reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25);
+ reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129);
+ reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129);
+ reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66);
+ reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66);
+ dst0 = __lasx_xvpackod_b(reg1, reg0);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_y, 0);
+ src_argb1555 += 64;
+ dst_y += 32;
+ }
+}
+
+void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i reg0, reg1, reg2, reg3, dst0;
+ __m256i const_112 = __lasx_xvldi(0x438);
+ __m256i const_74 = __lasx_xvldi(0x425);
+ __m256i const_38 = __lasx_xvldi(0x413);
+ __m256i const_94 = __lasx_xvldi(0x42F);
+ __m256i const_18 = __lasx_xvldi(0x409);
+ __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0,
+ next_argb1555, 32, src0, src1, src2, src3);
+ DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2);
+ DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3);
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);
+ nexb = __lasx_xvandi_b(tmp2, 0x1F);
+ tmpg = __lasx_xvsrli_b(tmp0, 5);
+ nexg = __lasx_xvsrli_b(tmp2, 5);
+ reg0 = __lasx_xvandi_b(tmp1, 0x03);
+ reg2 = __lasx_xvandi_b(tmp3, 0x03);
+ reg0 = __lasx_xvslli_b(reg0, 3);
+ reg2 = __lasx_xvslli_b(reg2, 3);
+ tmpg = __lasx_xvor_v(tmpg, reg0);
+ nexg = __lasx_xvor_v(nexg, reg2);
+ reg1 = __lasx_xvandi_b(tmp1, 0x7C);
+ reg3 = __lasx_xvandi_b(tmp3, 0x7C);
+ tmpr = __lasx_xvsrli_b(reg1, 2);
+ nexr = __lasx_xvsrli_b(reg3, 2);
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvslli_b(tmpg, 3);
+ reg2 = __lasx_xvslli_b(tmpr, 3);
+ tmpb = __lasx_xvsrli_b(tmpb, 2);
+ tmpg = __lasx_xvsrli_b(tmpg, 2);
+ tmpr = __lasx_xvsrli_b(tmpr, 2);
+ tmpb = __lasx_xvor_v(reg0, tmpb);
+ tmpg = __lasx_xvor_v(reg1, tmpg);
+ tmpr = __lasx_xvor_v(reg2, tmpr);
+ reg0 = __lasx_xvslli_b(nexb, 3);
+ reg1 = __lasx_xvslli_b(nexg, 3);
+ reg2 = __lasx_xvslli_b(nexr, 3);
+ nexb = __lasx_xvsrli_b(nexb, 2);
+ nexg = __lasx_xvsrli_b(nexg, 2);
+ nexr = __lasx_xvsrli_b(nexr, 2);
+ nexb = __lasx_xvor_v(reg0, nexb);
+ nexg = __lasx_xvor_v(reg1, nexg);
+ nexr = __lasx_xvor_v(reg2, nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
+ reg0 = __lasx_xvpermi_d(reg0, 0xD8);
+ reg1 = __lasx_xvpermi_d(reg1, 0xD8);
+ dst0 = __lasx_xvpickod_b(reg1, reg0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_v, 0, 1);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ src_argb1555 += 64;
+ next_argb1555 += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m256i reg0, reg1, dst0;
+ __m256i const_66 = __lasx_xvldi(66);
+ __m256i const_129 = __lasx_xvldi(129);
+ __m256i const_25 = __lasx_xvldi(25);
+ __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
+ 0x1080108010801080, 0x1080108010801080};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_rgb565, 0);
+ src1 = __lasx_xvld(src_rgb565, 32);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);
+ tmpr = __lasx_xvandi_b(tmp1, 0xF8);
+ reg1 = __lasx_xvandi_b(tmp1, 0x07);
+ reg0 = __lasx_xvsrli_b(tmp0, 5);
+ reg1 = __lasx_xvslli_b(reg1, 3);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvsrli_b(tmpb, 2);
+ tmpb = __lasx_xvor_v(reg1, reg0);
+ reg0 = __lasx_xvslli_b(tmpg, 2);
+ reg1 = __lasx_xvsrli_b(tmpg, 4);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ reg0 = __lasx_xvsrli_b(tmpr, 5);
+ tmpr = __lasx_xvor_v(tmpr, reg0);
+ reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25);
+ reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25);
+ reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129);
+ reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129);
+ reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66);
+ reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66);
+ dst0 = __lasx_xvpackod_b(reg1, reg0);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);
+ __lasx_xvst(dst0, dst_y, 0);
+ dst_y += 32;
+ src_rgb565 += 64;
+ }
+}
+
+void RGB565ToUVRow_LASX(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i reg0, reg1, reg2, reg3, dst0;
+ __m256i const_112 = __lasx_xvldi(0x438);
+ __m256i const_74 = __lasx_xvldi(0x425);
+ __m256i const_38 = __lasx_xvldi(0x413);
+ __m256i const_94 = __lasx_xvldi(0x42F);
+ __m256i const_18 = __lasx_xvldi(0x409);
+ __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_rgb565, 0, src_rgb565, 32, next_rgb565, 0,
+ next_rgb565, 32, src0, src1, src2, src3);
+ DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2);
+ DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3);
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);
+ tmpr = __lasx_xvandi_b(tmp1, 0xF8);
+ nexb = __lasx_xvandi_b(tmp2, 0x1F);
+ nexr = __lasx_xvandi_b(tmp3, 0xF8);
+ reg1 = __lasx_xvandi_b(tmp1, 0x07);
+ reg3 = __lasx_xvandi_b(tmp3, 0x07);
+ reg0 = __lasx_xvsrli_b(tmp0, 5);
+ reg1 = __lasx_xvslli_b(reg1, 3);
+ reg2 = __lasx_xvsrli_b(tmp2, 5);
+ reg3 = __lasx_xvslli_b(reg3, 3);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ nexg = __lasx_xvor_v(reg2, reg3);
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvsrli_b(tmpb, 2);
+ reg2 = __lasx_xvslli_b(nexb, 3);
+ reg3 = __lasx_xvsrli_b(nexb, 2);
+ tmpb = __lasx_xvor_v(reg1, reg0);
+ nexb = __lasx_xvor_v(reg2, reg3);
+ reg0 = __lasx_xvslli_b(tmpg, 2);
+ reg1 = __lasx_xvsrli_b(tmpg, 4);
+ reg2 = __lasx_xvslli_b(nexg, 2);
+ reg3 = __lasx_xvsrli_b(nexg, 4);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ nexg = __lasx_xvor_v(reg2, reg3);
+ reg0 = __lasx_xvsrli_b(tmpr, 5);
+ reg2 = __lasx_xvsrli_b(nexr, 5);
+ tmpr = __lasx_xvor_v(tmpr, reg0);
+ nexr = __lasx_xvor_v(nexr, reg2);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
+ reg0 = __lasx_xvpermi_d(reg0, 0xD8);
+ reg1 = __lasx_xvpermi_d(reg1, 0xD8);
+ dst0 = __lasx_xvpickod_b(reg1, reg0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_v, 0, 1);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ dst_u += 16;
+ dst_v += 16;
+ src_rgb565 += 64;
+ next_rgb565 += 64;
+ }
+}
+
+void RGB24ToUVRow_LASX(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24;
+ int len = width / 32;
+ __m256i src0, src1, src2, reg0, reg1, reg2;
+ __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i const_112 = __lasx_xvldi(0x438);
+ __m256i const_74 = __lasx_xvldi(0x425);
+ __m256i const_38 = __lasx_xvldi(0x413);
+ __m256i const_94 = __lasx_xvldi(0x42F);
+ __m256i const_18 = __lasx_xvldi(0x409);
+ __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+ __m256i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18,
+ 0x15120F0C09060300, 0x00000000001E1B18};
+ __m256i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908,
+ 0x0706050403020100, 0x1D1A1714110A0908};
+ __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19,
+ 0x1613100D0A070401, 0x00000000001F1C19};
+ __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908,
+ 0x0706050403020100, 0x1E1B1815120A0908};
+ __m256i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A,
+ 0x1714110E0B080502, 0x0000000000001D1A};
+ __m256i shuff1_r = {0x0706050403020100, 0x1F1C191613100908,
+ 0x0706050403020100, 0x1F1C191613100908};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_rgb24, 0, src_rgb24, 32, src_rgb24, 64,
+ next_rgb24, 0, reg0, reg1, reg2, tmp0);
+ DUP2_ARG2(__lasx_xvld, next_rgb24, 32, next_rgb24, 64, tmp1, tmp2);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1,
+ 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
+ DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
+ nexr);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
+ nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
+ dst0 = __lasx_xvpickod_b(reg1, reg0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_v, 0, 1);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ src_rgb24 += 96;
+ next_rgb24 += 96;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void RAWToUVRow_LASX(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_raw = src_raw + src_stride_raw;
+ int len = width / 32;
+ __m256i src0, src1, src2, reg0, reg1, reg2;
+ __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i const_112 = __lasx_xvldi(0x438);
+ __m256i const_74 = __lasx_xvldi(0x425);
+ __m256i const_38 = __lasx_xvldi(0x413);
+ __m256i const_94 = __lasx_xvldi(0x42F);
+ __m256i const_18 = __lasx_xvldi(0x409);
+ __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+ __m256i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18,
+ 0x15120F0C09060300, 0x00000000001E1B18};
+ __m256i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908,
+ 0x0706050403020100, 0x1D1A1714110A0908};
+ __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19,
+ 0x1613100D0A070401, 0x00000000001F1C19};
+ __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908,
+ 0x0706050403020100, 0x1E1B1815120A0908};
+ __m256i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A,
+ 0x1714110E0B080502, 0x0000000000001D1A};
+ __m256i shuff1_b = {0x0706050403020100, 0x1F1C191613100908,
+ 0x0706050403020100, 0x1F1C191613100908};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64, next_raw, 0,
+ reg0, reg1, reg2, tmp0);
+ DUP2_ARG2(__lasx_xvld, next_raw, 32, next_raw, 64, tmp1, tmp2);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1,
+ 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
+ DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
+ nexr);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
+ nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
+ dst0 = __lasx_xvpickod_b(reg1, reg0);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_v, 0, 1);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ src_raw += 96;
+ next_raw += 96;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
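+// NV12 chroma is interleaved and biased by 0x80; it is re-centered with a
+// byte subtract and sign-extended to 16 bits before the matrix multiply.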
+void NV12ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_vrub, vec_vgug, vec_y, vec_vu;
+ __m256i out_b, out_g, out_r;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i alpha = __lasx_xvldi(0xFF);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lasx_xvld(src_y, 0);
+ vec_vu = __lasx_xvld(src_uv, 0);
+ vec_vu = __lasx_xvsub_b(vec_vu, const_0x80);
+ vec_vu = __lasx_vext2xv_h_b(vec_vu);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g,
+ out_b);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 16;
+ src_uv += 16;
+ }
+}
+
+void NV12ToRGB565Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_vrub, vec_vgug, vec_y, vec_vu;
+ __m256i out_b, out_g, out_r;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lasx_xvld(src_y, 0);
+ vec_vu = __lasx_xvld(src_uv, 0);
+ vec_vu = __lasx_xvsub_b(vec_vu, const_0x80);
+ vec_vu = __lasx_vext2xv_h_b(vec_vu);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g,
+ out_b);
+ out_b = __lasx_xvsrli_h(out_b, 3);
+ out_g = __lasx_xvsrli_h(out_g, 2);
+ out_r = __lasx_xvsrli_h(out_r, 3);
+ out_g = __lasx_xvslli_h(out_g, 5);
+ out_r = __lasx_xvslli_h(out_r, 11);
+ out_r = __lasx_xvor_v(out_r, out_g);
+ out_r = __lasx_xvor_v(out_r, out_b);
+ __lasx_xvst(out_r, dst_rgb565, 0);
+ src_y += 16;
+ src_uv += 16;
+ dst_rgb565 += 32;
+ }
+}
+
+void NV21ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg, vec_y, vec_uv;
+ __m256i out_b, out_g, out_r;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i alpha = __lasx_xvldi(0xFF);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lasx_xvld(src_y, 0);
+ vec_uv = __lasx_xvld(src_uv, 0);
+ vec_uv = __lasx_xvsub_b(vec_uv, const_0x80);
+ vec_uv = __lasx_vext2xv_h_b(vec_uv);
+ YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 16;
+ src_uv += 16;
+ }
+}
+
+struct RgbConstants {
+ uint8_t kRGBToY[4];
+ uint16_t kAddY;
+ uint16_t pad;
+};
+
+// RGB to JPEG coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
+ 128,
+ 0};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+ 0x1080,
+ 0};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
+ 0x1080,
+ 0};
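+
+// Scalar reference for the Y rows below, per pixel with bytes p[0..2]:
+//   y = (kRGBToY[0] * p[0] + kRGBToY[1] * p[1] + kRGBToY[2] * p[2] + kAddY) >> 8
+// kAddY combines the +16 range offset (BT.601 only) and the +0.5 rounding
+// term in 8.8 fixed point.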
+
+// ARGB expects the first 3 values to contain RGB and the 4th value to be
+// ignored.
+static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
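+ // shuff: word order for xvperm.w, restoring linear pixel order after the
+ // per-128-bit-lane byte picks.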
+ asm volatile(
+ "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
+ "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
+ "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
+ "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants
+ "xvld $xr20, %4, 0 \n\t" // load shuff
+ "1: \n\t"
+ "xvld $xr4, %0, 0 \n\t"
+ "xvld $xr5, %0, 32 \n\t"
+ "xvld $xr6, %0, 64 \n\t"
+ "xvld $xr7, %0, 96 \n\t" // load 32 pixels of
+ // ARGB
+ "xvor.v $xr12, $xr3, $xr3 \n\t"
+ "xvor.v $xr13, $xr3, $xr3 \n\t"
+ "addi.d %2, %2, -32 \n\t" // 32 processed per
+ // loop.
+ "xvpickev.b $xr8, $xr5, $xr4 \n\t" // BR
+ "xvpickev.b $xr10, $xr7, $xr6 \n\t"
+ "xvpickod.b $xr9, $xr5, $xr4 \n\t" // GA
+ "xvpickod.b $xr11, $xr7, $xr6 \n\t"
+ "xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" // B
+ "xvmaddwev.h.bu $xr13, $xr10, $xr0 \n\t"
+ "xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" // G
+ "xvmaddwev.h.bu $xr13, $xr11, $xr1 \n\t"
+ "xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" // R
+ "xvmaddwod.h.bu $xr13, $xr10, $xr2 \n\t"
+ "addi.d %0, %0, 128 \n\t"
+ "xvpickod.b $xr10, $xr13, $xr12 \n\t"
+ "xvperm.w $xr11, $xr10, $xr20 \n\t"
+ "xvst $xr11, %1, 0 \n\t"
+ "addi.d %1, %1, 32 \n\t"
+ "bnez %2, 1b \n\t"
+ : "+&r"(src_argb), // %0
+ "+&r"(dst_y), // %1
+ "+&r"(width) // %2
+ : "r"(rgbconstants), "r"(shuff)
+ : "memory");
+}
+
+void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_LASX(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+
+void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_LASX(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_LASX(src_abgr, dst_y, width, &kRawI601Constants);
+}
+
+void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_LASX(src_abgr, dst_yj, width, &kRawJPEGConstants);
+}
+
+// RGBA expects the first value to be A (ignored), followed by 3 values
+// containing RGB.
+// Same code as ARGB, except the even/odd byte picks select different lanes
+// to account for the leading alpha byte.
+static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
+ asm volatile(
+ "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
+ "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
+ "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
+ "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants
+ "xvld $xr20, %4, 0 \n\t" // load shuff
+ "1: \n\t"
+ "xvld $xr4, %0, 0 \n\t"
+ "xvld $xr5, %0, 32 \n\t"
+ "xvld $xr6, %0, 64 \n\t"
+ "xvld $xr7, %0, 96 \n\t" // load 32 pixels of
+ // RGBA
+ "xvor.v $xr12, $xr3, $xr3 \n\t"
+ "xvor.v $xr13, $xr3, $xr3 \n\t"
+ "addi.d %2, %2, -32 \n\t" // 32 processed per
+ // loop.
+ "xvpickev.b $xr8, $xr5, $xr4 \n\t" // AG
+ "xvpickev.b $xr10, $xr7, $xr6 \n\t"
+ "xvpickod.b $xr9, $xr5, $xr4 \n\t" // BR
+ "xvpickod.b $xr11, $xr7, $xr6 \n\t"
+ "xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" // B
+ "xvmaddwev.h.bu $xr13, $xr11, $xr0 \n\t"
+ "xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" // G
+ "xvmaddwod.h.bu $xr13, $xr10, $xr1 \n\t"
+ "xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" // R
+ "xvmaddwod.h.bu $xr13, $xr11, $xr2 \n\t"
+ "addi.d %0, %0, 128 \n\t"
+ "xvpickod.b $xr10, $xr13, $xr12 \n\t"
+ "xvperm.w $xr11, $xr10, $xr20 \n\t"
+ "xvst $xr11, %1, 0 \n\t"
+ "addi.d %1, %1, 32 \n\t"
+ "bnez %2, 1b \n\t"
+ : "+&r"(src_rgba), // %0
+ "+&r"(dst_y), // %1
+ "+&r"(width) // %2
+ : "r"(rgbconstants), "r"(shuff)
+ : "memory");
+}
+
+void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_LASX(src_rgba, dst_y, width, &kRgb24I601Constants);
+}
+
+void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
+ RGBAToYMatrixRow_LASX(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_LASX(src_bgra, dst_y, width, &kRawI601Constants);
+}
+
+static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
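+ // Shuffle controls that de-interleave the packed 3-byte pixels so the B/R
+ // and G samples land in the even/odd byte lanes expected by the widening
+ // multiply-accumulates below.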
+ int8_t shuff[128] = {
+ 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
+ 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
+ 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
+ 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
+ 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
+ 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
+ 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0,
+ 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
+ asm volatile(
+ "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
+ "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
+ "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
+ "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants
+ "xvld $xr4, %4, 0 \n\t" // load shuff
+ "xvld $xr5, %4, 32 \n\t"
+ "xvld $xr6, %4, 64 \n\t"
+ "xvld $xr7, %4, 96 \n\t"
+ "1: \n\t"
+ "xvld $xr8, %0, 0 \n\t"
+ "xvld $xr9, %0, 32 \n\t"
+ "xvld $xr10, %0, 64 \n\t" // load 32 pixels of
+ // RGB
+ "xvor.v $xr12, $xr3, $xr3 \n\t"
+ "xvor.v $xr13, $xr3, $xr3 \n\t"
+ "xvor.v $xr11, $xr9, $xr9 \n\t"
+ "addi.d %2, %2, -32 \n\t" // 32 processed per
+ // loop.
+ "xvpermi.q $xr9, $xr8, 0x30 \n\t" // src0
+ "xvpermi.q $xr8, $xr10, 0x03 \n\t" // src1
+ "xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2
+ "xvshuf.b $xr14, $xr8, $xr9, $xr4 \n\t"
+ "xvshuf.b $xr15, $xr8, $xr10, $xr5 \n\t"
+ "xvshuf.b $xr16, $xr8, $xr9, $xr6 \n\t"
+ "xvshuf.b $xr17, $xr8, $xr10, $xr7 \n\t"
+ "xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" // G
+ "xvmaddwev.h.bu $xr13, $xr17, $xr1 \n\t"
+ "xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" // B
+ "xvmaddwev.h.bu $xr13, $xr15, $xr0 \n\t"
+ "xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" // R
+ "xvmaddwod.h.bu $xr13, $xr15, $xr2 \n\t"
+ "addi.d %0, %0, 96 \n\t"
+ "xvpickod.b $xr10, $xr13, $xr12 \n\t"
+ "xvst $xr10, %1, 0 \n\t"
+ "addi.d %1, %1, 32 \n\t"
+ "bnez %2, 1b \n\t"
+ : "+&r"(src_rgba), // %0
+ "+&r"(dst_y), // %1
+ "+&r"(width) // %2
+ : "r"(rgbconstants), // %3
+ "r"(shuff) // %4
+ : "memory");
+}
+
+void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants);
+}
+
+void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants);
+}
+
+void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants);
+}
+
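+// Full-range (JPEG) chroma. As in ARGBToUVRow_LASX the 2x2 sums stay scaled
+// by 2, so halved coefficients are used: 63/42/21 (U) and 63/53/10 (V),
+// i.e. roughly half of 127/84/43 and 127/107/20, before the final >> 8.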
+void ARGBToUVJRow_LASX(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_argb = src_argb + src_stride_argb;
+ int len = width / 32;
+ __m256i src0, src1, src2, src3;
+ __m256i nex0, nex1, nex2, nex3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i reg0, reg1, dst0;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i const_63 = __lasx_xvldi(0x43F);
+ __m256i const_42 = __lasx_xvldi(0x42A);
+ __m256i const_21 = __lasx_xvldi(0x415);
+ __m256i const_53 = __lasx_xvldi(0x435);
+ __m256i const_10 = __lasx_xvldi(0x40A);
+ __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+ __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301,
+ 0x1F1D0F0D1B190B09};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
+ 96, src0, src1, src2, src3);
+ DUP4_ARG2(__lasx_xvld, next_argb, 0, next_argb, 32, next_argb, 64,
+ next_argb, 96, nex0, nex1, nex2, nex3);
+ tmp0 = __lasx_xvpickev_b(src1, src0);
+ tmp1 = __lasx_xvpickod_b(src1, src0);
+ tmp2 = __lasx_xvpickev_b(src3, src2);
+ tmp3 = __lasx_xvpickod_b(src3, src2);
+ tmpr = __lasx_xvpickod_b(tmp2, tmp0);
+ tmpb = __lasx_xvpickev_b(tmp2, tmp0);
+ tmpg = __lasx_xvpickev_b(tmp3, tmp1);
+ tmp0 = __lasx_xvpickev_b(nex1, nex0);
+ tmp1 = __lasx_xvpickod_b(nex1, nex0);
+ tmp2 = __lasx_xvpickev_b(nex3, nex2);
+ tmp3 = __lasx_xvpickod_b(nex3, nex2);
+ nexr = __lasx_xvpickod_b(tmp2, tmp0);
+ nexb = __lasx_xvpickev_b(tmp2, tmp0);
+ nexg = __lasx_xvpickev_b(tmp3, tmp1);
+ tmp0 = __lasx_xvaddwev_h_bu(tmpb, nexb);
+ tmp1 = __lasx_xvaddwod_h_bu(tmpb, nexb);
+ tmp2 = __lasx_xvaddwev_h_bu(tmpg, nexg);
+ tmp3 = __lasx_xvaddwod_h_bu(tmpg, nexg);
+ reg0 = __lasx_xvaddwev_h_bu(tmpr, nexr);
+ reg1 = __lasx_xvaddwod_h_bu(tmpr, nexr);
+ tmpb = __lasx_xvavgr_hu(tmp0, tmp1);
+ tmpg = __lasx_xvavgr_hu(tmp2, tmp3);
+ tmpr = __lasx_xvavgr_hu(reg0, reg1);
+ reg0 = __lasx_xvmadd_h(const_8080, const_63, tmpb);
+ reg1 = __lasx_xvmadd_h(const_8080, const_63, tmpr);
+ reg0 = __lasx_xvmsub_h(reg0, const_42, tmpg);
+ reg1 = __lasx_xvmsub_h(reg1, const_53, tmpg);
+ reg0 = __lasx_xvmsub_h(reg0, const_21, tmpr);
+ reg1 = __lasx_xvmsub_h(reg1, const_10, tmpb);
+ dst0 = __lasx_xvpackod_b(reg1, reg0);
+ tmp0 = __lasx_xvpermi_d(dst0, 0x44);
+ tmp1 = __lasx_xvpermi_d(dst0, 0xEE);
+ dst0 = __lasx_xvshuf_b(tmp1, tmp0, shuff);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_v, 0, 2);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 1);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ dst_u += 16;
+ dst_v += 16;
+ src_argb += 128;
+ next_argb += 128;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
diff --git a/source/row_lsx.cc b/source/row_lsx.cc
new file mode 100644
index 00000000..fa088c9e
--- /dev/null
+++ b/source/row_lsx.cc
@@ -0,0 +1,2987 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, vr, ub, vg, ug, yg, yb) \
+ { \
+ ub = __lsx_vreplgr2vr_h(yuvconst->kUVToB[0]); \
+ vr = __lsx_vreplgr2vr_h(yuvconst->kUVToR[1]); \
+ ug = __lsx_vreplgr2vr_h(yuvconst->kUVToG[0]); \
+ vg = __lsx_vreplgr2vr_h(yuvconst->kUVToG[1]); \
+ yg = __lsx_vreplgr2vr_h(yuvconst->kYToRgb[0]); \
+ yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
+ }
+
+// Load data for 32 YUV422 pixels
+#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \
+ { \
+ __m128i temp0, temp1; \
+ \
+ DUP2_ARG2(__lsx_vld, psrc_y, 0, psrc_u, 0, out_y, temp0); \
+ temp1 = __lsx_vld(psrc_v, 0); \
+ temp0 = __lsx_vsub_b(temp0, const_80); \
+ temp1 = __lsx_vsub_b(temp1, const_80); \
+ temp0 = __lsx_vsllwil_h_b(temp0, 0); \
+ temp1 = __lsx_vsllwil_h_b(temp1, 0); \
+ uv_l = __lsx_vilvl_h(temp0, temp1); \
+ uv_h = __lsx_vilvh_h(temp0, temp1); \
+ }
+
+// Load data for 16 YUV422 pixels
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \
+ { \
+ __m128i temp0, temp1; \
+ \
+ out_y = __lsx_vld(psrc_y, 0); \
+ temp0 = __lsx_vldrepl_d(psrc_u, 0); \
+ temp1 = __lsx_vldrepl_d(psrc_v, 0); \
+ uv = __lsx_vilvl_b(temp0, temp1); \
+ uv = __lsx_vsub_b(uv, const_80); \
+ uv = __lsx_vsllwil_h_b(uv, 0); \
+ }
+
+// Convert 16 pixels of YUV420 to RGB.
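+// Y is scaled by yg in 16.16 fixed point and biased by yb; the U/V
+// contributions are in x.6 fixed point, so each term is shifted right by 6
+// and clipped to [0, 255].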
+#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \
+ g_h, r_l, r_h) \
+ { \
+ __m128i u_l, u_h, v_l, v_h; \
+ __m128i yl_ev, yl_od, yh_ev, yh_od; \
+ __m128i temp0, temp1, temp2, temp3; \
+ \
+ temp0 = __lsx_vilvl_b(in_y, in_y); \
+ temp1 = __lsx_vilvh_b(in_y, in_y); \
+ yl_ev = __lsx_vmulwev_w_hu_h(temp0, yg); \
+ yl_od = __lsx_vmulwod_w_hu_h(temp0, yg); \
+ yh_ev = __lsx_vmulwev_w_hu_h(temp1, yg); \
+ yh_od = __lsx_vmulwod_w_hu_h(temp1, yg); \
+ DUP4_ARG2(__lsx_vsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \
+ yl_ev, yl_od, yh_ev, yh_od); \
+ yl_ev = __lsx_vadd_w(yl_ev, yb); \
+ yl_od = __lsx_vadd_w(yl_od, yb); \
+ yh_ev = __lsx_vadd_w(yh_ev, yb); \
+ yh_od = __lsx_vadd_w(yh_od, yb); \
+ v_l = __lsx_vmulwev_w_h(in_uvl, ubvr); \
+ u_l = __lsx_vmulwod_w_h(in_uvl, ubvr); \
+ v_h = __lsx_vmulwev_w_h(in_uvh, ubvr); \
+ u_h = __lsx_vmulwod_w_h(in_uvh, ubvr); \
+ temp0 = __lsx_vadd_w(yl_ev, u_l); \
+ temp1 = __lsx_vadd_w(yl_od, u_l); \
+ temp2 = __lsx_vadd_w(yh_ev, u_h); \
+ temp3 = __lsx_vadd_w(yh_od, u_h); \
+ DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ b_l = __lsx_vpackev_h(temp1, temp0); \
+ b_h = __lsx_vpackev_h(temp3, temp2); \
+ temp0 = __lsx_vadd_w(yl_ev, v_l); \
+ temp1 = __lsx_vadd_w(yl_od, v_l); \
+ temp2 = __lsx_vadd_w(yh_ev, v_h); \
+ temp3 = __lsx_vadd_w(yh_od, v_h); \
+ DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ r_l = __lsx_vpackev_h(temp1, temp0); \
+ r_h = __lsx_vpackev_h(temp3, temp2); \
+ DUP2_ARG2(__lsx_vdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \
+ temp0 = __lsx_vsub_w(yl_ev, u_l); \
+ temp1 = __lsx_vsub_w(yl_od, u_l); \
+ temp2 = __lsx_vsub_w(yh_ev, u_h); \
+ temp3 = __lsx_vsub_w(yh_od, u_h); \
+ DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ g_l = __lsx_vpackev_h(temp1, temp0); \
+ g_h = __lsx_vpackev_h(temp3, temp2); \
+ }
+
+// Convert 8 pixels of YUV to RGB; in_vu holds raw interleaved chroma bytes
+// (the UV plane of NV12/NV21, or the uv output of READYUV422).
+#define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \
+ { \
+ __m128i y_ev, y_od, u_l, v_l; \
+ __m128i tmp0, tmp1, tmp2, tmp3; \
+ \
+ tmp0 = __lsx_vilvl_b(in_y, in_y); \
+ y_ev = __lsx_vmulwev_w_hu_h(tmp0, yg); \
+ y_od = __lsx_vmulwod_w_hu_h(tmp0, yg); \
+ y_ev = __lsx_vsrai_w(y_ev, 16); \
+ y_od = __lsx_vsrai_w(y_od, 16); \
+ y_ev = __lsx_vadd_w(y_ev, yb); \
+ y_od = __lsx_vadd_w(y_od, yb); \
+ in_vu = __lsx_vilvl_b(zero, in_vu); \
+ in_vu = __lsx_vsub_h(in_vu, __lsx_vldi(0x480)); \
+ u_l = __lsx_vmulwev_w_h(in_vu, vrub); \
+ v_l = __lsx_vmulwod_w_h(in_vu, vrub); \
+ tmp0 = __lsx_vadd_w(y_ev, u_l); \
+ tmp1 = __lsx_vadd_w(y_od, u_l); \
+ tmp2 = __lsx_vadd_w(y_ev, v_l); \
+ tmp3 = __lsx_vadd_w(y_od, v_l); \
+ tmp0 = __lsx_vsrai_w(tmp0, 6); \
+ tmp1 = __lsx_vsrai_w(tmp1, 6); \
+ tmp2 = __lsx_vsrai_w(tmp2, 6); \
+ tmp3 = __lsx_vsrai_w(tmp3, 6); \
+ tmp0 = __lsx_vclip255_w(tmp0); \
+ tmp1 = __lsx_vclip255_w(tmp1); \
+ tmp2 = __lsx_vclip255_w(tmp2); \
+ tmp3 = __lsx_vclip255_w(tmp3); \
+ out_b = __lsx_vpackev_h(tmp1, tmp0); \
+ out_r = __lsx_vpackev_h(tmp3, tmp2); \
+ tmp0 = __lsx_vdp2_w_h(in_vu, vgug); \
+ tmp1 = __lsx_vsub_w(y_ev, tmp0); \
+ tmp2 = __lsx_vsub_w(y_od, tmp0); \
+ tmp1 = __lsx_vsrai_w(tmp1, 6); \
+ tmp2 = __lsx_vsrai_w(tmp2, 6); \
+ tmp1 = __lsx_vclip255_w(tmp1); \
+ tmp2 = __lsx_vclip255_w(tmp2); \
+ out_g = __lsx_vpackev_h(tmp2, tmp1); \
+ }
+
+// Convert 8 pixels of I444 to RGB.
+#define I444TORGB(in_yy, in_u, in_v, ub, vr, ugvg, yg, yb, out_b, out_g, \
+ out_r) \
+ { \
+ __m128i y_ev, y_od, u_ev, v_ev, u_od, v_od; \
+ __m128i tmp0, tmp1, tmp2, tmp3; \
+ \
+ y_ev = __lsx_vmulwev_w_hu_h(in_yy, yg); \
+ y_od = __lsx_vmulwod_w_hu_h(in_yy, yg); \
+ y_ev = __lsx_vsrai_w(y_ev, 16); \
+ y_od = __lsx_vsrai_w(y_od, 16); \
+ y_ev = __lsx_vadd_w(y_ev, yb); \
+ y_od = __lsx_vadd_w(y_od, yb); \
+ in_u = __lsx_vsub_h(in_u, const_80); \
+ in_v = __lsx_vsub_h(in_v, const_80); \
+ u_ev = __lsx_vmulwev_w_h(in_u, ub); \
+ u_od = __lsx_vmulwod_w_h(in_u, ub); \
+ v_ev = __lsx_vmulwev_w_h(in_v, vr); \
+ v_od = __lsx_vmulwod_w_h(in_v, vr); \
+ tmp0 = __lsx_vadd_w(y_ev, u_ev); \
+ tmp1 = __lsx_vadd_w(y_od, u_od); \
+ tmp2 = __lsx_vadd_w(y_ev, v_ev); \
+ tmp3 = __lsx_vadd_w(y_od, v_od); \
+ tmp0 = __lsx_vsrai_w(tmp0, 6); \
+ tmp1 = __lsx_vsrai_w(tmp1, 6); \
+ tmp2 = __lsx_vsrai_w(tmp2, 6); \
+ tmp3 = __lsx_vsrai_w(tmp3, 6); \
+ tmp0 = __lsx_vclip255_w(tmp0); \
+ tmp1 = __lsx_vclip255_w(tmp1); \
+ tmp2 = __lsx_vclip255_w(tmp2); \
+ tmp3 = __lsx_vclip255_w(tmp3); \
+ out_b = __lsx_vpackev_h(tmp1, tmp0); \
+ out_r = __lsx_vpackev_h(tmp3, tmp2); \
+ u_ev = __lsx_vpackev_h(in_u, in_v); \
+ u_od = __lsx_vpackod_h(in_u, in_v); \
+ v_ev = __lsx_vdp2_w_h(u_ev, ugvg); \
+ v_od = __lsx_vdp2_w_h(u_od, ugvg); \
+ tmp0 = __lsx_vsub_w(y_ev, v_ev); \
+ tmp1 = __lsx_vsub_w(y_od, v_od); \
+ tmp0 = __lsx_vsrai_w(tmp0, 6); \
+ tmp1 = __lsx_vsrai_w(tmp1, 6); \
+ tmp0 = __lsx_vclip255_w(tmp0); \
+ tmp1 = __lsx_vclip255_w(tmp1); \
+ out_g = __lsx_vpackev_h(tmp1, tmp0); \
+ }
+
+// Pack and Store 16 ARGB values.
+#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \
+ { \
+ __m128i temp0, temp1, temp2, temp3; \
+ temp0 = __lsx_vpackev_b(g_l, b_l); \
+ temp1 = __lsx_vpackev_b(a_l, r_l); \
+ temp2 = __lsx_vpackev_b(g_h, b_h); \
+ temp3 = __lsx_vpackev_b(a_h, r_h); \
+ r_l = __lsx_vilvl_h(temp1, temp0); \
+ r_h = __lsx_vilvh_h(temp1, temp0); \
+ g_l = __lsx_vilvl_h(temp3, temp2); \
+ g_h = __lsx_vilvh_h(temp3, temp2); \
+ __lsx_vst(r_l, pdst_argb, 0); \
+ __lsx_vst(r_h, pdst_argb, 16); \
+ __lsx_vst(g_l, pdst_argb, 32); \
+ __lsx_vst(g_h, pdst_argb, 48); \
+ pdst_argb += 64; \
+ }
+
+// Pack and Store 8 ARGB values.
+#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
+ { \
+ __m128i temp0, temp1; \
+ __m128i dst0, dst1; \
+ \
+ temp0 = __lsx_vpackev_b(in_g, in_b); \
+ temp1 = __lsx_vpackev_b(in_a, in_r); \
+ dst0 = __lsx_vilvl_h(temp1, temp0); \
+ dst1 = __lsx_vilvh_h(temp1, temp0); \
+ __lsx_vst(dst0, pdst_argb, 0); \
+ __lsx_vst(dst1, pdst_argb, 16); \
+ pdst_argb += 32; \
+ }
+
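+// Box-filter a 2x2 block of B/G/R (current row plus next row), then form
+// U = 0x8080 + 56*B - 37*G - 19*R and V = 0x8080 + 56*R - 47*G - 9*B.
+// The usual 112/74/38/94/18 coefficients appear halved because the 2x2
+// average computed here still carries a factor of two.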
+#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
+ { \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3; \
+ __m128i _reg0, _reg1; \
+ _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \
+ _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \
+ _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg); \
+ _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \
+ _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \
+ _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \
+ _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \
+ _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \
+ _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \
+ _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \
+ _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \
+ _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \
+ _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \
+ _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \
+ _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \
+ _dst0 = __lsx_vpickod_b(_reg1, _reg0); \
+ }
+
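+// Mirror 32 bytes per iteration: vshuf_b with descending indices reverses
+// each 16-byte vector and the two stores are written in swapped order.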
+void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) {
+ int x;
+ int len = width / 32;
+ __m128i src0, src1;
+ __m128i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607};
+ src += width - 32;
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
+ src1);
+ __lsx_vst(src1, dst, 0);
+ __lsx_vst(src0, dst, 16);
+ dst += 32;
+ src -= 32;
+ }
+}
+
+void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ int len = width / 8;
+ __m128i src, dst;
+ __m128i shuffler = {0x0004000500060007, 0x0000000100020003};
+
+ src_uv += (width - 8) << 1;
+ for (x = 0; x < len; x++) {
+ src = __lsx_vld(src_uv, 0);
+ dst = __lsx_vshuf_h(shuffler, src, src);
+ __lsx_vst(dst, dst_uv, 0);
+ src_uv -= 16;
+ dst_uv += 16;
+ }
+}
+
+void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1;
+ __m128i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504};
+
+ src += (width * 4) - 32;
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
+ src1);
+ __lsx_vst(src1, dst, 0);
+ __lsx_vst(src0, dst, 16);
+ dst += 32;
+ src -= 32;
+ }
+}
+
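+// YUY2 byte order is Y0 U Y1 V, so interleaving the packed UV pairs with Y
+// yields two 16-byte outputs per 16 luma samples.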
+void I422ToYUY2Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src_u0, src_v0, src_y0, vec_uv0;
+ __m128i vec_yuy2_0, vec_yuy2_1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0);
+ src_y0 = __lsx_vld(src_y, 0);
+ vec_uv0 = __lsx_vilvl_b(src_v0, src_u0);
+ vec_yuy2_0 = __lsx_vilvl_b(vec_uv0, src_y0);
+ vec_yuy2_1 = __lsx_vilvh_b(vec_uv0, src_y0);
+ __lsx_vst(vec_yuy2_0, dst_yuy2, 0);
+ __lsx_vst(vec_yuy2_1, dst_yuy2, 16);
+ src_u += 8;
+ src_v += 8;
+ src_y += 16;
+ dst_yuy2 += 32;
+ }
+}
+
+void I422ToUYVYRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src_u0, src_v0, src_y0, vec_uv0;
+ __m128i vec_uyvy0, vec_uyvy1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0);
+ src_y0 = __lsx_vld(src_y, 0);
+ vec_uv0 = __lsx_vilvl_b(src_v0, src_u0);
+ vec_uyvy0 = __lsx_vilvl_b(src_y0, vec_uv0);
+ vec_uyvy1 = __lsx_vilvh_b(src_y0, vec_uv0);
+ __lsx_vst(vec_uyvy0, dst_uyvy, 0);
+ __lsx_vst(vec_uyvy1, dst_uyvy, 16);
+ src_u += 8;
+ src_v += 8;
+ src_y += 16;
+ dst_uyvy += 32;
+ }
+}
+
+void I422ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i const_80 = __lsx_vldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+}
+
+void I422ToRGBARow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i const_80 = __lsx_vldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb);
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+}
+
+void I422AlphaToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ int res = width & 15;
+ __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i zero = __lsx_vldi(0);
+ __m128i const_80 = __lsx_vldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h;
+
+ y = __lsx_vld(src_a, 0);
+ a_l = __lsx_vilvl_b(zero, y);
+ a_h = __lsx_vilvh_b(zero, y);
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ src_a += 16;
+ }
+ if (res) {
+ __m128i y, uv, r, g, b, a;
+ a = __lsx_vld(src_a, 0);
+ a = __lsx_vsllwil_hu_bu(a, 0);
+ READYUV422(src_y, src_u, src_v, y, uv);
+ // uv is in V/U order while vec_ubvr pairs (vr, ub), so the even lanes
+ // produce R and the odd lanes produce B; swap the B/R outputs to match.
+ YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, r, g, b);
+ STOREARGB(a, r, g, b, dst_argb);
+ }
+}
+
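+// After conversion the B/G/R halfwords are packed to bytes and two shuffle
+// masks squeeze 16 BGR triples into three 16-byte stores (48 bytes).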
+void I422ToRGB24Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int32_t width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i const_80 = __lsx_vldi(0x80);
+ __m128i shuffler0 = {0x0504120302100100, 0x0A18090816070614};
+ __m128i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B};
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m128i temp0, temp1, temp2, temp3;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ temp0 = __lsx_vpackev_b(g_l, b_l);
+ temp1 = __lsx_vpackev_b(g_h, b_h);
+ DUP4_ARG3(__lsx_vshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, r_l,
+ temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0,
+ temp1);
+
+ b_l = __lsx_vilvl_d(temp1, temp2);
+ b_h = __lsx_vilvh_d(temp3, temp1);
+ __lsx_vst(temp0, dst_argb, 0);
+ __lsx_vst(b_l, dst_argb, 16);
+ __lsx_vst(b_h, dst_argb, 32);
+ dst_argb += 48;
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+}
+
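+// RGB565 layout: bits 15..11 R, 10..5 G, 4..0 B; each channel is truncated
+// by a right shift before being merged into place.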
+// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
+void I422ToRGB565Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i const_80 = __lsx_vldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ b_l = __lsx_vsrli_h(b_l, 3);
+ b_h = __lsx_vsrli_h(b_h, 3);
+ g_l = __lsx_vsrli_h(g_l, 2);
+ g_h = __lsx_vsrli_h(g_h, 2);
+ r_l = __lsx_vsrli_h(r_l, 3);
+ r_h = __lsx_vsrli_h(r_h, 3);
+ r_l = __lsx_vslli_h(r_l, 11);
+ r_h = __lsx_vslli_h(r_h, 11);
+ g_l = __lsx_vslli_h(g_l, 5);
+ g_h = __lsx_vslli_h(g_h, 5);
+ r_l = __lsx_vor_v(r_l, g_l);
+ r_l = __lsx_vor_v(r_l, b_l);
+ r_h = __lsx_vor_v(r_h, g_h);
+ r_h = __lsx_vor_v(r_h, b_h);
+ __lsx_vst(r_l, dst_rgb565, 0);
+ __lsx_vst(r_h, dst_rgb565, 16);
+ dst_rgb565 += 32;
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+}
+
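+// ARGB4444 layout: bits 15..12 A, 11..8 R, 7..4 G, 3..0 B; alpha is forced
+// to 0xF here.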
+// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
+void I422ToARGB4444Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i const_80 = __lsx_vldi(0x80);
+ __m128i alpha = (__m128i)v2u64{0xF000F000F000F000, 0xF000F000F000F000};
+ __m128i mask = {0x00F000F000F000F0, 0x00F000F000F000F0};
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ b_l = __lsx_vsrli_h(b_l, 4);
+ b_h = __lsx_vsrli_h(b_h, 4);
+ r_l = __lsx_vsrli_h(r_l, 4);
+ r_h = __lsx_vsrli_h(r_h, 4);
+ g_l = __lsx_vand_v(g_l, mask);
+ g_h = __lsx_vand_v(g_h, mask);
+ r_l = __lsx_vslli_h(r_l, 8);
+ r_h = __lsx_vslli_h(r_h, 8);
+ r_l = __lsx_vor_v(r_l, alpha);
+ r_h = __lsx_vor_v(r_h, alpha);
+ r_l = __lsx_vor_v(r_l, g_l);
+ r_h = __lsx_vor_v(r_h, g_h);
+ r_l = __lsx_vor_v(r_l, b_l);
+ r_h = __lsx_vor_v(r_h, b_h);
+ __lsx_vst(r_l, dst_argb4444, 0);
+ __lsx_vst(r_h, dst_argb4444, 16);
+ dst_argb4444 += 32;
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+}
+
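+// ARGB1555 layout: bit 15 A (forced to 1), bits 14..10 R, 9..5 G, 4..0 B.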
+void I422ToARGB1555Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i const_80 = __lsx_vldi(0x80);
+ __m128i alpha = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ b_l = __lsx_vsrli_h(b_l, 3);
+ b_h = __lsx_vsrli_h(b_h, 3);
+ g_l = __lsx_vsrli_h(g_l, 3);
+ g_h = __lsx_vsrli_h(g_h, 3);
+ g_l = __lsx_vslli_h(g_l, 5);
+ g_h = __lsx_vslli_h(g_h, 5);
+ r_l = __lsx_vsrli_h(r_l, 3);
+ r_h = __lsx_vsrli_h(r_h, 3);
+ r_l = __lsx_vslli_h(r_l, 10);
+ r_h = __lsx_vslli_h(r_h, 10);
+ r_l = __lsx_vor_v(r_l, alpha);
+ r_h = __lsx_vor_v(r_h, alpha);
+ r_l = __lsx_vor_v(r_l, g_l);
+ r_h = __lsx_vor_v(r_h, g_h);
+ r_l = __lsx_vor_v(r_l, b_l);
+ r_h = __lsx_vor_v(r_h, b_h);
+ __lsx_vst(r_l, dst_argb1555, 0);
+ __lsx_vst(r_h, dst_argb1555, 16);
+ dst_argb1555 += 32;
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+}
+
+void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1);
+ dst0 = __lsx_vpickev_b(src1, src0);
+ __lsx_vst(dst0, dst_y, 0);
+ src_yuy2 += 32;
+ dst_y += 16;
+ }
+}
+
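+// Chroma is shared by two rows in YUV420, so U/V from the current and next
+// YUY2 rows are averaged with rounding before being split into planes.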
+void YUY2ToUVRow_LSX(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src_yuy2_next, 0,
+ src_yuy2_next, 16, src0, src1, src2, src3);
+ src0 = __lsx_vpickod_b(src1, src0);
+ src1 = __lsx_vpickod_b(src3, src2);
+ tmp0 = __lsx_vavgr_bu(src1, src0);
+ dst0 = __lsx_vpickev_b(tmp0, tmp0);
+ dst1 = __lsx_vpickod_b(tmp0, tmp0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst1, dst_v, 0, 0);
+ src_yuy2 += 32;
+ src_yuy2_next += 32;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1);
+ tmp0 = __lsx_vpickod_b(src1, src0);
+ dst0 = __lsx_vpickev_b(tmp0, tmp0);
+ dst1 = __lsx_vpickod_b(tmp0, tmp0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst1, dst_v, 0, 0);
+ src_yuy2 += 32;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1);
+ dst0 = __lsx_vpickod_b(src1, src0);
+ __lsx_vst(dst0, dst_y, 0);
+ src_uyvy += 32;
+ dst_y += 16;
+ }
+}
+
+void UYVYToUVRow_LSX(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src_uyvy_next, 0,
+ src_uyvy_next, 16, src0, src1, src2, src3);
+ src0 = __lsx_vpickev_b(src1, src0);
+ src1 = __lsx_vpickev_b(src3, src2);
+ tmp0 = __lsx_vavgr_bu(src1, src0);
+ dst0 = __lsx_vpickev_b(tmp0, tmp0);
+ dst1 = __lsx_vpickod_b(tmp0, tmp0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst1, dst_v, 0, 0);
+ src_uyvy += 32;
+ src_uyvy_next += 32;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void UYVYToUV422Row_LSX(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, tmp0, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ dst0 = __lsx_vpickev_b(tmp0, tmp0);
+ dst1 = __lsx_vpickod_b(tmp0, tmp0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst1, dst_v, 0, 0);
+ src_uyvy += 32;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
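+// 2x2 subsample of ARGB to U/V. As in RGBTOUV, the box filter leaves a
+// factor of two in the averages, so the constants below hold half of the
+// nominal 0x70/0x4A/0x26/0x5E/0x12 coefficients their names refer to.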
+void ARGBToUVRow_LSX(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 16;
+ const uint8_t* src_argb1 = src_argb0 + src_stride_argb;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
+ __m128i const_0x70 = {0x0038003800380038, 0x0038003800380038};
+ __m128i const_0x4A = {0x0025002500250025, 0x0025002500250025};
+ __m128i const_0x26 = {0x0013001300130013, 0x0013001300130013};
+ __m128i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f};
+ __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009};
+ __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0,
+ 48, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32, src_argb1,
+ 48, src4, src5, src6, src7);
+ vec0 = __lsx_vaddwev_h_bu(src0, src4);
+ vec1 = __lsx_vaddwev_h_bu(src1, src5);
+ vec2 = __lsx_vaddwev_h_bu(src2, src6);
+ vec3 = __lsx_vaddwev_h_bu(src3, src7);
+ tmp0 = __lsx_vpickev_h(vec1, vec0);
+ tmp1 = __lsx_vpickev_h(vec3, vec2);
+ tmp2 = __lsx_vpickod_h(vec1, vec0);
+ tmp3 = __lsx_vpickod_h(vec3, vec2);
+ vec0 = __lsx_vaddwod_h_bu(src0, src4);
+ vec1 = __lsx_vaddwod_h_bu(src1, src5);
+ vec2 = __lsx_vaddwod_h_bu(src2, src6);
+ vec3 = __lsx_vaddwod_h_bu(src3, src7);
+ tmp4 = __lsx_vpickev_h(vec1, vec0);
+ tmp5 = __lsx_vpickev_h(vec3, vec2);
+ vec0 = __lsx_vpickev_h(tmp1, tmp0);
+ vec1 = __lsx_vpickod_h(tmp1, tmp0);
+ src0 = __lsx_vavgr_h(vec0, vec1);
+ vec0 = __lsx_vpickev_h(tmp3, tmp2);
+ vec1 = __lsx_vpickod_h(tmp3, tmp2);
+ src1 = __lsx_vavgr_h(vec0, vec1);
+ vec0 = __lsx_vpickev_h(tmp5, tmp4);
+ vec1 = __lsx_vpickod_h(tmp5, tmp4);
+ src2 = __lsx_vavgr_h(vec0, vec1);
+ dst0 = __lsx_vmadd_h(const_0x8080, src0, const_0x70);
+ dst0 = __lsx_vmsub_h(dst0, src2, const_0x4A);
+ dst0 = __lsx_vmsub_h(dst0, src1, const_0x26);
+ dst1 = __lsx_vmadd_h(const_0x8080, src1, const_0x70);
+ dst1 = __lsx_vmsub_h(dst1, src2, const_0x5E);
+ dst1 = __lsx_vmsub_h(dst1, src0, const_0x12);
+ dst0 = __lsx_vsrai_h(dst0, 8);
+ dst1 = __lsx_vsrai_h(dst1, 8);
+ dst0 = __lsx_vpickev_b(dst1, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ src_argb0 += 64;
+ src_argb1 += 64;
+ dst_u += 8;
+ dst_v += 8;
+ }
+}
+
+void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ int len = (width / 16) - 1;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i shuf = {0x0908060504020100, 0x000000000E0D0C0A};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vshuf_b(src0, src0, shuf);
+ tmp1 = __lsx_vshuf_b(src1, src1, shuf);
+ tmp2 = __lsx_vshuf_b(src2, src2, shuf);
+ tmp3 = __lsx_vshuf_b(src3, src3, shuf);
+ __lsx_vst(tmp0, dst_rgb, 0);
+ __lsx_vst(tmp1, dst_rgb, 12);
+ __lsx_vst(tmp2, dst_rgb, 24);
+ __lsx_vst(tmp3, dst_rgb, 36);
+ dst_rgb += 48;
+ src_argb += 64;
+ }
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vshuf_b(src0, src0, shuf);
+ tmp1 = __lsx_vshuf_b(src1, src1, shuf);
+ tmp2 = __lsx_vshuf_b(src2, src2, shuf);
+ tmp3 = __lsx_vshuf_b(src3, src3, shuf);
+ __lsx_vst(tmp0, dst_rgb, 0);
+ __lsx_vst(tmp1, dst_rgb, 12);
+ __lsx_vst(tmp2, dst_rgb, 24);
+ dst_rgb += 36;
+ __lsx_vst(tmp3, dst_rgb, 0);
+}
+
+void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ int len = (width / 16) - 1;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i shuf = {0x090A040506000102, 0x000000000C0D0E08};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vshuf_b(src0, src0, shuf);
+ tmp1 = __lsx_vshuf_b(src1, src1, shuf);
+ tmp2 = __lsx_vshuf_b(src2, src2, shuf);
+ tmp3 = __lsx_vshuf_b(src3, src3, shuf);
+ __lsx_vst(tmp0, dst_rgb, 0);
+ __lsx_vst(tmp1, dst_rgb, 12);
+ __lsx_vst(tmp2, dst_rgb, 24);
+ __lsx_vst(tmp3, dst_rgb, 36);
+ dst_rgb += 48;
+ src_argb += 64;
+ }
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vshuf_b(src0, src0, shuf);
+ tmp1 = __lsx_vshuf_b(src1, src1, shuf);
+ tmp2 = __lsx_vshuf_b(src2, src2, shuf);
+ tmp3 = __lsx_vshuf_b(src3, src3, shuf);
+ __lsx_vst(tmp0, dst_rgb, 0);
+ __lsx_vst(tmp1, dst_rgb, 12);
+ __lsx_vst(tmp2, dst_rgb, 24);
+ dst_rgb += 36;
+ __lsx_vst(tmp3, dst_rgb, 0);
+}
+
+void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ int len = width / 8;
+ __m128i zero = __lsx_vldi(0);
+ __m128i src0, src1, tmp0, tmp1, dst0;
+ __m128i shift = {0x0300030003000300, 0x0300030003000300};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmp0 = __lsx_vsrli_b(tmp0, 3);
+ tmp1 = __lsx_vpackev_b(zero, tmp1);
+ tmp1 = __lsx_vsrli_h(tmp1, 2);
+ tmp0 = __lsx_vsll_b(tmp0, shift);
+ tmp1 = __lsx_vslli_h(tmp1, 5);
+ dst0 = __lsx_vor_v(tmp0, tmp1);
+ __lsx_vst(dst0, dst_rgb, 0);
+ dst_rgb += 16;
+ src_argb += 32;
+ }
+}
+
+void ARGBToARGB1555Row_LSX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i zero = __lsx_vldi(0);
+ __m128i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i shift1 = {0x0703070307030703, 0x0703070307030703};
+ __m128i shift2 = {0x0200020002000200, 0x0200020002000200};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmp0 = __lsx_vsrli_b(tmp0, 3);
+ tmp1 = __lsx_vsrl_b(tmp1, shift1);
+ tmp0 = __lsx_vsll_b(tmp0, shift2);
+ tmp2 = __lsx_vpackev_b(zero, tmp1);
+ tmp3 = __lsx_vpackod_b(zero, tmp1);
+ tmp2 = __lsx_vslli_h(tmp2, 5);
+ tmp3 = __lsx_vslli_h(tmp3, 15);
+ dst0 = __lsx_vor_v(tmp0, tmp2);
+ dst0 = __lsx_vor_v(dst0, tmp3);
+ __lsx_vst(dst0, dst_rgb, 0);
+ dst_rgb += 16;
+ src_argb += 32;
+ }
+}
+
+void ARGBToARGB4444Row_LSX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, tmp0, tmp1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmp1 = __lsx_vandi_b(tmp1, 0xF0);
+ tmp0 = __lsx_vsrli_b(tmp0, 4);
+ dst0 = __lsx_vor_v(tmp1, tmp0);
+ __lsx_vst(dst0, dst_rgb, 0);
+ dst_rgb += 16;
+ src_argb += 32;
+ }
+}
+
+void ARGBToUV444Row_LSX(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int32_t width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3, dst0, dst1;
+ __m128i const_112 = __lsx_vldi(112);
+ __m128i const_74 = __lsx_vldi(74);
+ __m128i const_38 = __lsx_vldi(38);
+ __m128i const_94 = __lsx_vldi(94);
+ __m128i const_18 = __lsx_vldi(18);
+ __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vpickev_h(src1, src0);
+ tmp1 = __lsx_vpickod_h(src1, src0);
+ tmp2 = __lsx_vpickev_h(src3, src2);
+ tmp3 = __lsx_vpickod_h(src3, src2);
+ reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp0, const_112);
+ reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp2, const_112);
+ reg2 = __lsx_vmulwod_h_bu(tmp0, const_74);
+ reg3 = __lsx_vmulwod_h_bu(tmp2, const_74);
+ reg2 = __lsx_vmaddwev_h_bu(reg2, tmp1, const_38);
+ reg3 = __lsx_vmaddwev_h_bu(reg3, tmp3, const_38);
+ reg0 = __lsx_vsub_h(reg0, reg2);
+ reg1 = __lsx_vsub_h(reg1, reg3);
+ reg0 = __lsx_vsrai_h(reg0, 8);
+ reg1 = __lsx_vsrai_h(reg1, 8);
+ dst0 = __lsx_vpickev_b(reg1, reg0);
+
+ reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp1, const_112);
+ reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp3, const_112);
+ reg2 = __lsx_vmulwev_h_bu(tmp0, const_18);
+ reg3 = __lsx_vmulwev_h_bu(tmp2, const_18);
+ reg2 = __lsx_vmaddwod_h_bu(reg2, tmp0, const_94);
+ reg3 = __lsx_vmaddwod_h_bu(reg3, tmp2, const_94);
+ reg0 = __lsx_vsub_h(reg0, reg2);
+ reg1 = __lsx_vsub_h(reg1, reg3);
+ reg0 = __lsx_vsrai_h(reg0, 8);
+ reg1 = __lsx_vsrai_h(reg1, 8);
+ dst1 = __lsx_vpickev_b(reg1, reg0);
+
+ __lsx_vst(dst0, dst_u, 0);
+ __lsx_vst(dst1, dst_v, 0);
+ dst_u += 16;
+ dst_v += 16;
+ src_argb += 64;
+ }
+}
+
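+// Multiply: src0 is byte-duplicated (value * 257) and multiplied by the
+// zero-extended src1 channel; the high 16 bits of the product closely
+// approximate (a * b) / 255.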
+void ARGBMultiplyRow_LSX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 4;
+ __m128i zero = __lsx_vldi(0);
+ __m128i src0, src1, dst0, dst1;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1);
+ tmp0 = __lsx_vilvl_b(src0, src0);
+ tmp1 = __lsx_vilvh_b(src0, src0);
+ tmp2 = __lsx_vilvl_b(zero, src1);
+ tmp3 = __lsx_vilvh_b(zero, src1);
+ dst0 = __lsx_vmuh_hu(tmp0, tmp2);
+ dst1 = __lsx_vmuh_hu(tmp1, tmp3);
+ dst0 = __lsx_vpickev_b(dst1, dst0);
+ __lsx_vst(dst0, dst_argb, 0);
+ src_argb0 += 16;
+ src_argb1 += 16;
+ dst_argb += 16;
+ }
+}
+
+void ARGBAddRow_LSX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 4;
+ __m128i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1);
+ dst0 = __lsx_vsadd_bu(src0, src1);
+ __lsx_vst(dst0, dst_argb, 0);
+ src_argb0 += 16;
+ src_argb1 += 16;
+ dst_argb += 16;
+ }
+}
+
+void ARGBSubtractRow_LSX(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 4;
+ __m128i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1);
+ dst0 = __lsx_vssub_bu(src0, src1);
+ __lsx_vst(dst0, dst_argb, 0);
+ src_argb0 += 16;
+ src_argb1 += 16;
+ dst_argb += 16;
+ }
+}
+
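+// Attenuate: each channel and its alpha are duplicated into halfwords
+// (value * 257), multiplied into 32 bits and narrowed by 24 bits, a
+// fixed-point approximation of value * alpha / 255.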
+void ARGBAttenuateRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, tmp0, tmp1;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m128i b, g, r, a, dst0, dst1;
+ __m128i control = {0x0005000100040000, 0x0007000300060002};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ b = __lsx_vpackev_b(tmp0, tmp0);
+ r = __lsx_vpackod_b(tmp0, tmp0);
+ g = __lsx_vpackev_b(tmp1, tmp1);
+ a = __lsx_vpackod_b(tmp1, tmp1);
+ reg0 = __lsx_vmulwev_w_hu(b, a);
+ reg1 = __lsx_vmulwod_w_hu(b, a);
+ reg2 = __lsx_vmulwev_w_hu(r, a);
+ reg3 = __lsx_vmulwod_w_hu(r, a);
+ reg4 = __lsx_vmulwev_w_hu(g, a);
+ reg5 = __lsx_vmulwod_w_hu(g, a);
+ reg0 = __lsx_vssrani_h_w(reg1, reg0, 24);
+ reg2 = __lsx_vssrani_h_w(reg3, reg2, 24);
+ reg4 = __lsx_vssrani_h_w(reg5, reg4, 24);
+ reg0 = __lsx_vshuf_h(control, reg0, reg0);
+ reg2 = __lsx_vshuf_h(control, reg2, reg2);
+ reg4 = __lsx_vshuf_h(control, reg4, reg4);
+ tmp0 = __lsx_vpackev_b(reg4, reg0);
+ tmp1 = __lsx_vpackev_b(a, reg2);
+ dst0 = __lsx_vilvl_h(tmp1, tmp0);
+ dst1 = __lsx_vilvh_h(tmp1, tmp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ dst_argb += 32;
+ src_argb += 32;
+ }
+}
+
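+// The 4-byte dither pattern is widened to halfwords and added to each
+// channel with clipping before the RGB565 truncation.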
+void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, tmp0, tmp1, dst0;
+ __m128i b, g, r;
+ __m128i zero = __lsx_vldi(0);
+ __m128i vec_dither = __lsx_vldrepl_w(&dither4, 0);
+
+ vec_dither = __lsx_vilvl_b(zero, vec_dither);
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ b = __lsx_vpackev_b(zero, tmp0);
+ r = __lsx_vpackod_b(zero, tmp0);
+ g = __lsx_vpackev_b(zero, tmp1);
+ b = __lsx_vadd_h(b, vec_dither);
+ g = __lsx_vadd_h(g, vec_dither);
+ r = __lsx_vadd_h(r, vec_dither);
+ DUP2_ARG1(__lsx_vclip255_h, b, g, b, g);
+ r = __lsx_vclip255_h(r);
+ b = __lsx_vsrai_h(b, 3);
+ g = __lsx_vsrai_h(g, 2);
+ r = __lsx_vsrai_h(r, 3);
+ g = __lsx_vslli_h(g, 5);
+ r = __lsx_vslli_h(r, 11);
+ dst0 = __lsx_vor_v(b, g);
+ dst0 = __lsx_vor_v(dst0, r);
+ __lsx_vst(dst0, dst_rgb, 0);
+ src_argb += 32;
+ dst_rgb += 16;
+ }
+}
+
+void ARGBShuffleRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, dst0, dst1;
+ __m128i shuf = {0x0404040400000000, 0x0C0C0C0C08080808};
+ __m128i temp = __lsx_vldrepl_w(shuffler, 0);
+
+ shuf = __lsx_vadd_b(shuf, temp);
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ dst0 = __lsx_vshuf_b(src0, src0, shuf);
+ dst1 = __lsx_vshuf_b(src1, src1, shuf);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBShadeRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ int x;
+ int len = width / 4;
+ __m128i src0, dst0, tmp0, tmp1;
+ __m128i vec_value = __lsx_vreplgr2vr_w(value);
+
+ vec_value = __lsx_vilvl_b(vec_value, vec_value);
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_argb, 0);
+ tmp0 = __lsx_vilvl_b(src0, src0);
+ tmp1 = __lsx_vilvh_b(src0, src0);
+ tmp0 = __lsx_vmuh_hu(tmp0, vec_value);
+ tmp1 = __lsx_vmuh_hu(tmp1, vec_value);
+ dst0 = __lsx_vpickod_b(tmp1, tmp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ src_argb += 16;
+ dst_argb += 16;
+ }
+}
+
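+// Gray = (77*R + 150*G + 29*B + 128) >> 8, written to B, G and R with the
+// original alpha preserved.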
+void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, tmp0, tmp1;
+ __m128i reg0, reg1, reg2, dst0, dst1;
+ __m128i const_128 = __lsx_vldi(0x480);
+ __m128i const_150 = __lsx_vldi(0x96);
+ __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ reg0 = __lsx_vdp2_h_bu(tmp0, const_br);
+ reg1 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150);
+ reg2 = __lsx_vadd_h(reg0, reg1);
+ tmp0 = __lsx_vpackod_b(reg2, reg2);
+ tmp1 = __lsx_vpackod_b(tmp1, reg2);
+ dst0 = __lsx_vilvl_h(tmp1, tmp0);
+ dst1 = __lsx_vilvh_h(tmp1, tmp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
+
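+// In-place sepia: b = (17*B + 68*G + 35*R) >> 7, g = (22*B + 88*G + 45*R)
+// >> 7, r = (24*B + 98*G + 50*R) >> 7; g and r are saturated to 255 and
+// alpha is untouched.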
+void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, tmp0, tmp1;
+ __m128i reg0, reg1, spb, spg, spr;
+ __m128i dst0, dst1;
+ __m128i spb_g = __lsx_vldi(68);
+ __m128i spg_g = __lsx_vldi(88);
+ __m128i spr_g = __lsx_vldi(98);
+ __m128i spb_br = {0x2311231123112311, 0x2311231123112311};
+ __m128i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16};
+ __m128i spr_br = {0x3218321832183218, 0x3218321832183218};
+ __m128i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ DUP2_ARG2(__lsx_vdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg);
+ spr = __lsx_vdp2_h_bu(tmp0, spr_br);
+ spb = __lsx_vmaddwev_h_bu(spb, tmp1, spb_g);
+ spg = __lsx_vmaddwev_h_bu(spg, tmp1, spg_g);
+ spr = __lsx_vmaddwev_h_bu(spr, tmp1, spr_g);
+ spb = __lsx_vsrli_h(spb, 7);
+ spg = __lsx_vsrli_h(spg, 7);
+ spr = __lsx_vsrli_h(spr, 7);
+ spg = __lsx_vsat_hu(spg, 7);
+ spr = __lsx_vsat_hu(spr, 7);
+ reg0 = __lsx_vpackev_b(spg, spb);
+ reg1 = __lsx_vshuf_b(tmp1, spr, shuff);
+ dst0 = __lsx_vilvl_h(reg1, reg0);
+ dst1 = __lsx_vilvh_h(reg1, reg0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ dst_argb += 32;
+ }
+}
+
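+// Each 4-bit channel expands to 8 bits by replicating its nibble.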
+void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_argb4444, 0);
+ src1 = __lsx_vld(src_argb4444, 16);
+ tmp0 = __lsx_vandi_b(src0, 0x0F);
+ tmp1 = __lsx_vandi_b(src0, 0xF0);
+ tmp2 = __lsx_vandi_b(src1, 0x0F);
+ tmp3 = __lsx_vandi_b(src1, 0xF0);
+ reg0 = __lsx_vslli_b(tmp0, 4);
+ reg2 = __lsx_vslli_b(tmp2, 4);
+ reg1 = __lsx_vsrli_b(tmp1, 4);
+ reg3 = __lsx_vsrli_b(tmp3, 4);
+ DUP4_ARG2(__lsx_vor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3, tmp0,
+ tmp1, tmp2, tmp3);
+ dst0 = __lsx_vilvl_b(tmp1, tmp0);
+ dst2 = __lsx_vilvl_b(tmp3, tmp2);
+ dst1 = __lsx_vilvh_b(tmp1, tmp0);
+ dst3 = __lsx_vilvh_b(tmp3, tmp2);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_argb4444 += 32;
+ }
+}
+
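+// 5-bit channels expand as (x << 3) | (x >> 2); the alpha bit becomes 0x00
+// or 0xFF by negating the extracted 0/1 byte.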
+void ARGB1555ToARGBRow_LSX(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1;
+ __m128i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa;
+ __m128i reg0, reg1, reg2;
+ __m128i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_argb1555, 0);
+ src1 = __lsx_vld(src_argb1555, 16);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);
+ tmpg = __lsx_vsrli_b(tmp0, 5);
+ reg0 = __lsx_vandi_b(tmp1, 0x03);
+ reg0 = __lsx_vslli_b(reg0, 3);
+ tmpg = __lsx_vor_v(tmpg, reg0);
+ reg1 = __lsx_vandi_b(tmp1, 0x7C);
+ tmpr = __lsx_vsrli_b(reg1, 2);
+ tmpa = __lsx_vsrli_b(tmp1, 7);
+ tmpa = __lsx_vneg_b(tmpa);
+ reg0 = __lsx_vslli_b(tmpb, 3);
+ reg1 = __lsx_vslli_b(tmpg, 3);
+ reg2 = __lsx_vslli_b(tmpr, 3);
+ tmpb = __lsx_vsrli_b(tmpb, 2);
+ tmpg = __lsx_vsrli_b(tmpg, 2);
+ tmpr = __lsx_vsrli_b(tmpr, 2);
+ tmpb = __lsx_vor_v(reg0, tmpb);
+ tmpg = __lsx_vor_v(reg1, tmpg);
+ tmpr = __lsx_vor_v(reg2, tmpr);
+ DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
+ dst0 = __lsx_vilvl_h(reg1, reg0);
+ dst1 = __lsx_vilvh_h(reg1, reg0);
+ DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
+ dst2 = __lsx_vilvl_h(reg1, reg0);
+ dst3 = __lsx_vilvh_h(reg1, reg0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_argb1555 += 32;
+ }
+}
+
+void RGB565ToARGBRow_LSX(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1;
+ __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m128i reg0, reg1, dst0, dst1, dst2, dst3;
+ __m128i alpha = __lsx_vldi(0xFF);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_rgb565, 0);
+ src1 = __lsx_vld(src_rgb565, 16);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);
+ tmpr = __lsx_vandi_b(tmp1, 0xF8);
+ reg1 = __lsx_vandi_b(tmp1, 0x07);
+ reg0 = __lsx_vsrli_b(tmp0, 5);
+ reg1 = __lsx_vslli_b(reg1, 3);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ reg0 = __lsx_vslli_b(tmpb, 3);
+ reg1 = __lsx_vsrli_b(tmpb, 2);
+ tmpb = __lsx_vor_v(reg1, reg0);
+ reg0 = __lsx_vslli_b(tmpg, 2);
+ reg1 = __lsx_vsrli_b(tmpg, 4);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ reg0 = __lsx_vsrli_b(tmpr, 5);
+ tmpr = __lsx_vor_v(tmpr, reg0);
+ DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
+ dst0 = __lsx_vilvl_h(reg1, reg0);
+ dst1 = __lsx_vilvh_h(reg1, reg0);
+ DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
+ dst2 = __lsx_vilvl_h(reg1, reg0);
+ dst3 = __lsx_vilvh_h(reg1, reg0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_rgb565 += 32;
+ }
+}
+
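+// 48 RGB24 bytes are regrouped into four 12-byte runs, then a shuffle
+// against the alpha vector inserts 0xFF after every B,G,R triple.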
+void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i tmp0, tmp1, tmp2;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514};
+ __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100};
+ __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C};
+ __m128i shuf3 = {0x1005040310020100, 0x100B0A0910080706};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_rgb24, 0);
+ src1 = __lsx_vld(src_rgb24, 16);
+ src2 = __lsx_vld(src_rgb24, 32);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1);
+ tmp2 = __lsx_vshuf_b(src1, src2, shuf2);
+ DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
+ tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_rgb24 += 48;
+ }
+}
+
+void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i tmp0, tmp1, tmp2;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514};
+ __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100};
+ __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C};
+ __m128i shuf3 = {0x1003040510000102, 0x10090A0B10060708};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_raw, 0);
+ src1 = __lsx_vld(src_raw, 16);
+ src2 = __lsx_vld(src_raw, 32);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1);
+ tmp2 = __lsx_vshuf_b(src1, src2, shuf2);
+ DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
+ tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_raw += 48;
+ }
+}
+
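+// Y = (66*R + 129*G + 25*B + 0x1080) >> 8; the 0x1080 bias folds in the
+// +16 luma offset and 0.5 for rounding.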
+void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1;
+ __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m128i reg0, reg1, reg2, dst0;
+ __m128i const_66 = __lsx_vldi(66);
+ __m128i const_129 = __lsx_vldi(129);
+ __m128i const_25 = __lsx_vldi(25);
+ __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_argb1555, 0);
+ src1 = __lsx_vld(src_argb1555, 16);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);
+ tmpg = __lsx_vsrli_b(tmp0, 5);
+ reg0 = __lsx_vandi_b(tmp1, 0x03);
+ reg0 = __lsx_vslli_b(reg0, 3);
+ tmpg = __lsx_vor_v(tmpg, reg0);
+ reg1 = __lsx_vandi_b(tmp1, 0x7C);
+ tmpr = __lsx_vsrli_b(reg1, 2);
+ reg0 = __lsx_vslli_b(tmpb, 3);
+ reg1 = __lsx_vslli_b(tmpg, 3);
+ reg2 = __lsx_vslli_b(tmpr, 3);
+ tmpb = __lsx_vsrli_b(tmpb, 2);
+ tmpg = __lsx_vsrli_b(tmpg, 2);
+ tmpr = __lsx_vsrli_b(tmpr, 2);
+ tmpb = __lsx_vor_v(reg0, tmpb);
+ tmpg = __lsx_vor_v(reg1, tmpg);
+ tmpr = __lsx_vor_v(reg2, tmpr);
+ reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25);
+ reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25);
+ reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129);
+ reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129);
+ reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66);
+ reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66);
+ dst0 = __lsx_vpackod_b(reg1, reg0);
+ __lsx_vst(dst0, dst_y, 0);
+ dst_y += 16;
+ src_argb1555 += 32;
+ }
+}
+
+void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 16;
+ const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i reg0, reg1, reg2, reg3, dst0;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb1555, 0, src_argb1555, 16, next_argb1555, 0,
+ next_argb1555, 16, src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);
+ nexb = __lsx_vandi_b(tmp2, 0x1F);
+ tmpg = __lsx_vsrli_b(tmp0, 5);
+ nexg = __lsx_vsrli_b(tmp2, 5);
+ reg0 = __lsx_vandi_b(tmp1, 0x03);
+ reg2 = __lsx_vandi_b(tmp3, 0x03);
+ reg0 = __lsx_vslli_b(reg0, 3);
+ reg2 = __lsx_vslli_b(reg2, 3);
+ tmpg = __lsx_vor_v(tmpg, reg0);
+ nexg = __lsx_vor_v(nexg, reg2);
+ reg1 = __lsx_vandi_b(tmp1, 0x7C);
+ reg3 = __lsx_vandi_b(tmp3, 0x7C);
+ tmpr = __lsx_vsrli_b(reg1, 2);
+ nexr = __lsx_vsrli_b(reg3, 2);
+ reg0 = __lsx_vslli_b(tmpb, 3);
+ reg1 = __lsx_vslli_b(tmpg, 3);
+ reg2 = __lsx_vslli_b(tmpr, 3);
+ tmpb = __lsx_vsrli_b(tmpb, 2);
+ tmpg = __lsx_vsrli_b(tmpg, 2);
+ tmpr = __lsx_vsrli_b(tmpr, 2);
+ tmpb = __lsx_vor_v(reg0, tmpb);
+ tmpg = __lsx_vor_v(reg1, tmpg);
+ tmpr = __lsx_vor_v(reg2, tmpr);
+ reg0 = __lsx_vslli_b(nexb, 3);
+ reg1 = __lsx_vslli_b(nexg, 3);
+ reg2 = __lsx_vslli_b(nexr, 3);
+ nexb = __lsx_vsrli_b(nexb, 2);
+ nexg = __lsx_vsrli_b(nexg, 2);
+ nexr = __lsx_vsrli_b(nexr, 2);
+ nexb = __lsx_vor_v(reg0, nexb);
+ nexg = __lsx_vor_v(reg1, nexg);
+ nexr = __lsx_vor_v(reg2, nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_argb1555 += 32;
+ next_argb1555 += 32;
+ }
+}
+
+void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1;
+ __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m128i reg0, reg1, dst0;
+ __m128i const_66 = __lsx_vldi(66);
+ __m128i const_129 = __lsx_vldi(129);
+ __m128i const_25 = __lsx_vldi(25);
+ __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_rgb565, 0);
+ src1 = __lsx_vld(src_rgb565, 16);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);
+ tmpr = __lsx_vandi_b(tmp1, 0xF8);
+ reg1 = __lsx_vandi_b(tmp1, 0x07);
+ reg0 = __lsx_vsrli_b(tmp0, 5);
+ reg1 = __lsx_vslli_b(reg1, 3);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ reg0 = __lsx_vslli_b(tmpb, 3);
+ reg1 = __lsx_vsrli_b(tmpb, 2);
+ tmpb = __lsx_vor_v(reg1, reg0);
+ reg0 = __lsx_vslli_b(tmpg, 2);
+ reg1 = __lsx_vsrli_b(tmpg, 4);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ reg0 = __lsx_vsrli_b(tmpr, 5);
+ tmpr = __lsx_vor_v(tmpr, reg0);
+ reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25);
+ reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25);
+ reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129);
+ reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129);
+ reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66);
+ reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66);
+ dst0 = __lsx_vpackod_b(reg1, reg0);
+ __lsx_vst(dst0, dst_y, 0);
+ dst_y += 16;
+ src_rgb565 += 32;
+ }
+}
+
+void RGB565ToUVRow_LSX(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 16;
+ const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i reg0, reg1, reg2, reg3, dst0;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_rgb565, 0, src_rgb565, 16, next_rgb565, 0,
+ next_rgb565, 16, src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);
+ tmpr = __lsx_vandi_b(tmp1, 0xF8);
+ nexb = __lsx_vandi_b(tmp2, 0x1F);
+ nexr = __lsx_vandi_b(tmp3, 0xF8);
+ reg1 = __lsx_vandi_b(tmp1, 0x07);
+ reg3 = __lsx_vandi_b(tmp3, 0x07);
+ reg0 = __lsx_vsrli_b(tmp0, 5);
+ reg1 = __lsx_vslli_b(reg1, 3);
+ reg2 = __lsx_vsrli_b(tmp2, 5);
+ reg3 = __lsx_vslli_b(reg3, 3);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ nexg = __lsx_vor_v(reg2, reg3);
+ reg0 = __lsx_vslli_b(tmpb, 3);
+ reg1 = __lsx_vsrli_b(tmpb, 2);
+ reg2 = __lsx_vslli_b(nexb, 3);
+ reg3 = __lsx_vsrli_b(nexb, 2);
+ tmpb = __lsx_vor_v(reg1, reg0);
+ nexb = __lsx_vor_v(reg2, reg3);
+ reg0 = __lsx_vslli_b(tmpg, 2);
+ reg1 = __lsx_vsrli_b(tmpg, 4);
+ reg2 = __lsx_vslli_b(nexg, 2);
+ reg3 = __lsx_vsrli_b(nexg, 4);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ nexg = __lsx_vor_v(reg2, reg3);
+ reg0 = __lsx_vsrli_b(tmpr, 5);
+ reg2 = __lsx_vsrli_b(nexr, 5);
+ tmpr = __lsx_vor_v(tmpr, reg0);
+ nexr = __lsx_vor_v(nexr, reg2);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_rgb565 += 32;
+ next_rgb565 += 32;
+ }
+}
+
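+// Six shuffle masks gather the B, G and R bytes of 16 RGB24 pixels (spread
+// over three vectors per row) into per-channel vectors for RGBTOUV.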
+void RGB24ToUVRow_LSX(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i nex0, nex1, nex2, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18};
+ __m128i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908};
+ __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
+ __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908};
+ __m128i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A};
+ __m128i shuff1_r = {0x0706050403020100, 0x1F1C191613100908};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_rgb24, 0);
+ src1 = __lsx_vld(src_rgb24, 16);
+ src2 = __lsx_vld(src_rgb24, 32);
+ nex0 = __lsx_vld(next_rgb24, 0);
+ nex1 = __lsx_vld(next_rgb24, 16);
+ nex2 = __lsx_vld(next_rgb24, 32);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
+ nexr);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
+ nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_rgb24 += 48;
+ next_rgb24 += 48;
+ }
+}
+
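+// Same as RGB24ToUVRow above, but with the B and R gather masks swapped to
+// account for the RAW (R,G,B byte order) layout.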
+void RAWToUVRow_LSX(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_raw = src_raw + src_stride_raw;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i nex0, nex1, nex2, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18};
+ __m128i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908};
+ __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
+ __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908};
+ __m128i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A};
+ __m128i shuff1_b = {0x0706050403020100, 0x1F1C191613100908};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_raw, 0);
+ src1 = __lsx_vld(src_raw, 16);
+ src2 = __lsx_vld(src_raw, 32);
+ nex0 = __lsx_vld(next_raw, 0);
+ nex1 = __lsx_vld(next_raw, 16);
+ nex2 = __lsx_vld(next_raw, 32);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
+ nexr);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
+ nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_raw += 48;
+ next_raw += 48;
+ }
+}
+
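+// NV12 chroma is a single interleaved U/V plane; YUVTORGB consumes the raw
+// bytes directly, so only the loads differ from the I422 path.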
+void NV12ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i vec_y, vec_vu;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_vrub, vec_vgug;
+ __m128i out_b, out_g, out_r;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i zero = __lsx_vldi(0);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_vu = __lsx_vld(src_uv, 0);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 8;
+ src_uv += 8;
+ }
+}
+
+void NV12ToRGB565Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i vec_y, vec_vu;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_vrub, vec_vgug;
+ __m128i out_b, out_g, out_r;
+ __m128i zero = __lsx_vldi(0);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_vu = __lsx_vld(src_uv, 0);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ out_b = __lsx_vsrli_h(out_b, 3);
+ out_g = __lsx_vsrli_h(out_g, 2);
+ out_r = __lsx_vsrli_h(out_r, 3);
+ out_g = __lsx_vslli_h(out_g, 5);
+ out_r = __lsx_vslli_h(out_r, 11);
+ out_r = __lsx_vor_v(out_r, out_g);
+ out_r = __lsx_vor_v(out_r, out_b);
+ __lsx_vst(out_r, dst_rgb565, 0);
+ src_y += 8;
+ src_uv += 8;
+ dst_rgb565 += 16;
+ }
+}
+
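+// NV21 stores chroma V-first; this is handled by swapping both the constant
+// pairing (ubvr instead of vrub) and the B/R outputs of YUVTORGB.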
+void NV21ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i vec_y, vec_uv;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i out_b, out_g, out_r;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i zero = __lsx_vldi(0);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_uv = __lsx_vld(src_vu, 0);
+ YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_r, out_g,
+ out_b);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 8;
+ src_vu += 8;
+ }
+}
+
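+// dst = saturating sum of the two gradients, replicated to B, G and R with
+// alpha forced to 0xFF.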
+void SobelRow_LSX(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, tmp0;
+ __m128i out0, out1, out2, out3;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i shuff0 = {0x1001010110000000, 0x1003030310020202};
+ __m128i shuff1 = __lsx_vaddi_bu(shuff0, 0x04);
+ __m128i shuff2 = __lsx_vaddi_bu(shuff1, 0x04);
+ __m128i shuff3 = __lsx_vaddi_bu(shuff2, 0x04);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_sobelx, 0);
+ src1 = __lsx_vld(src_sobely, 0);
+ tmp0 = __lsx_vsadd_bu(src0, src1);
+ DUP4_ARG3(__lsx_vshuf_b, alpha, tmp0, shuff0, alpha, tmp0, shuff1, alpha,
+ tmp0, shuff2, alpha, tmp0, shuff3, out0, out1, out2, out3);
+ __lsx_vst(out0, dst_argb, 0);
+ __lsx_vst(out1, dst_argb, 16);
+ __lsx_vst(out2, dst_argb, 32);
+ __lsx_vst(out3, dst_argb, 48);
+ src_sobelx += 16;
+ src_sobely += 16;
+ dst_argb += 64;
+ }
+}
+
+void SobelToPlaneRow_LSX(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m128i src0, src1, src2, src3, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_sobelx, 0, src_sobelx, 16, src0, src1);
+ DUP2_ARG2(__lsx_vld, src_sobely, 0, src_sobely, 16, src2, src3);
+ dst0 = __lsx_vsadd_bu(src0, src2);
+ dst1 = __lsx_vsadd_bu(src1, src3);
+ __lsx_vst(dst0, dst_y, 0);
+ __lsx_vst(dst1, dst_y, 16);
+ src_sobelx += 32;
+ src_sobely += 32;
+ dst_y += 32;
+ }
+}
+
+void SobelXYRow_LSX(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src_r, src_b, src_g;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i alpha = __lsx_vldi(0xFF);
+
+ for (x = 0; x < len; x++) {
+ src_r = __lsx_vld(src_sobelx, 0);
+ src_b = __lsx_vld(src_sobely, 0);
+ src_g = __lsx_vsadd_bu(src_r, src_b);
+ tmp0 = __lsx_vilvl_b(src_g, src_b);
+ tmp1 = __lsx_vilvh_b(src_g, src_b);
+ tmp2 = __lsx_vilvl_b(alpha, src_r);
+ tmp3 = __lsx_vilvh_b(alpha, src_r);
+ dst0 = __lsx_vilvl_h(tmp2, tmp0);
+ dst1 = __lsx_vilvh_h(tmp2, tmp0);
+ dst2 = __lsx_vilvl_h(tmp3, tmp1);
+ dst3 = __lsx_vilvh_h(tmp3, tmp1);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ src_sobelx += 16;
+ src_sobely += 16;
+ dst_argb += 64;
+ }
+}
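+
+// SobelXYRow packs both gradients into one ARGB pixel per input byte. A
+// scalar sketch of the interleave result (illustration only):
+//   dst_argb[4 * i + 0] = src_sobely[i];                           // B
+//   dst_argb[4 * i + 1] = clamp255(src_sobelx[i] + src_sobely[i]); // G
+//   dst_argb[4 * i + 2] = src_sobelx[i];                           // R
+//   dst_argb[4 * i + 3] = 255;                                     // A
+// where clamp255 stands for the saturating add done by __lsx_vsadd_bu.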
+
+void BGRAToUVRow_LSX(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_bgra = src_bgra + src_stride_bgra;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i nex0, nex1, nex2, nex3;
+ __m128i tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, next_bgra, 0, next_bgra, 16, next_bgra, 32, next_bgra,
+ 48, nex0, nex1, nex2, nex3);
+ tmp0 = __lsx_vpickod_b(src1, src0);
+ tmp1 = __lsx_vpickev_b(src1, src0);
+ tmp2 = __lsx_vpickod_b(src3, src2);
+ tmp3 = __lsx_vpickev_b(src3, src2);
+ tmpb = __lsx_vpickod_b(tmp2, tmp0);
+ tmpr = __lsx_vpickev_b(tmp2, tmp0);
+ tmpg = __lsx_vpickod_b(tmp3, tmp1);
+ tmp0 = __lsx_vpickod_b(nex1, nex0);
+ tmp1 = __lsx_vpickev_b(nex1, nex0);
+ tmp2 = __lsx_vpickod_b(nex3, nex2);
+ tmp3 = __lsx_vpickev_b(nex3, nex2);
+ nexb = __lsx_vpickod_b(tmp2, tmp0);
+ nexr = __lsx_vpickev_b(tmp2, tmp0);
+ nexg = __lsx_vpickod_b(tmp3, tmp1);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_bgra += 64;
+ next_bgra += 64;
+ }
+}
+
+void ABGRToUVRow_LSX(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_abgr = src_abgr + src_stride_abgr;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i nex0, nex1, nex2, nex3;
+ __m128i tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, next_abgr, 0, next_abgr, 16, next_abgr, 32, next_abgr,
+ 48, nex0, nex1, nex2, nex3);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmp2 = __lsx_vpickev_b(src3, src2);
+ tmp3 = __lsx_vpickod_b(src3, src2);
+ tmpb = __lsx_vpickod_b(tmp2, tmp0);
+ tmpr = __lsx_vpickev_b(tmp2, tmp0);
+ tmpg = __lsx_vpickev_b(tmp3, tmp1);
+ tmp0 = __lsx_vpickev_b(nex1, nex0);
+ tmp1 = __lsx_vpickod_b(nex1, nex0);
+ tmp2 = __lsx_vpickev_b(nex3, nex2);
+ tmp3 = __lsx_vpickod_b(nex3, nex2);
+ nexb = __lsx_vpickod_b(tmp2, tmp0);
+ nexr = __lsx_vpickev_b(tmp2, tmp0);
+ nexg = __lsx_vpickev_b(tmp3, tmp1);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_abgr += 64;
+ next_abgr += 64;
+ }
+}
+
+void RGBAToUVRow_LSX(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_rgba = src_rgba + src_stride_rgba;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i nex0, nex1, nex2, nex3;
+ __m128i tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, next_rgba, 0, next_rgba, 16, next_rgba, 32, next_rgba,
+ 48, nex0, nex1, nex2, nex3);
+ tmp0 = __lsx_vpickod_b(src1, src0);
+ tmp1 = __lsx_vpickev_b(src1, src0);
+ tmp2 = __lsx_vpickod_b(src3, src2);
+ tmp3 = __lsx_vpickev_b(src3, src2);
+ tmpr = __lsx_vpickod_b(tmp2, tmp0);
+ tmpb = __lsx_vpickev_b(tmp2, tmp0);
+ tmpg = __lsx_vpickod_b(tmp3, tmp1);
+ tmp0 = __lsx_vpickod_b(nex1, nex0);
+ tmp1 = __lsx_vpickev_b(nex1, nex0);
+ tmp2 = __lsx_vpickod_b(nex3, nex2);
+ tmp3 = __lsx_vpickev_b(nex3, nex2);
+ nexr = __lsx_vpickod_b(tmp2, tmp0);
+ nexb = __lsx_vpickev_b(tmp2, tmp0);
+ nexg = __lsx_vpickod_b(tmp3, tmp1);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_rgba += 64;
+ next_rgba += 64;
+ }
+}
+
+void ARGBToUVJRow_LSX(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_argb = src_argb + src_stride_argb;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i nex0, nex1, nex2, nex3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_63 = __lsx_vldi(0x43F);
+ __m128i const_42 = __lsx_vldi(0x42A);
+ __m128i const_21 = __lsx_vldi(0x415);
+ __m128i const_53 = __lsx_vldi(0x435);
+ __m128i const_10 = __lsx_vldi(0x40A);
+ __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, next_argb, 0, next_argb, 16, next_argb, 32, next_argb,
+ 48, nex0, nex1, nex2, nex3);
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmp2 = __lsx_vpickev_b(src3, src2);
+ tmp3 = __lsx_vpickod_b(src3, src2);
+ tmpr = __lsx_vpickod_b(tmp2, tmp0);
+ tmpb = __lsx_vpickev_b(tmp2, tmp0);
+ tmpg = __lsx_vpickev_b(tmp3, tmp1);
+ tmp0 = __lsx_vpickev_b(nex1, nex0);
+ tmp1 = __lsx_vpickod_b(nex1, nex0);
+ tmp2 = __lsx_vpickev_b(nex3, nex2);
+ tmp3 = __lsx_vpickod_b(nex3, nex2);
+ nexr = __lsx_vpickod_b(tmp2, tmp0);
+ nexb = __lsx_vpickev_b(tmp2, tmp0);
+ nexg = __lsx_vpickev_b(tmp3, tmp1);
+ tmp0 = __lsx_vaddwev_h_bu(tmpb, nexb);
+ tmp1 = __lsx_vaddwod_h_bu(tmpb, nexb);
+ tmp2 = __lsx_vaddwev_h_bu(tmpg, nexg);
+ tmp3 = __lsx_vaddwod_h_bu(tmpg, nexg);
+ reg0 = __lsx_vaddwev_h_bu(tmpr, nexr);
+ reg1 = __lsx_vaddwod_h_bu(tmpr, nexr);
+ tmpb = __lsx_vavgr_hu(tmp0, tmp1);
+ tmpg = __lsx_vavgr_hu(tmp2, tmp3);
+ tmpr = __lsx_vavgr_hu(reg0, reg1);
+ reg0 = __lsx_vmadd_h(const_8080, const_63, tmpb);
+ reg1 = __lsx_vmadd_h(const_8080, const_63, tmpr);
+ reg0 = __lsx_vmsub_h(reg0, const_42, tmpg);
+ reg1 = __lsx_vmsub_h(reg1, const_53, tmpg);
+ reg0 = __lsx_vmsub_h(reg0, const_21, tmpr);
+ reg1 = __lsx_vmsub_h(reg1, const_10, tmpb);
+ dst0 = __lsx_vpickod_b(reg1, reg0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_argb += 64;
+ next_argb += 64;
+ }
+}
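+
+// ARGBToUVJRow_LSX is the JPEG (full-range) chroma transform on a 2x2 box
+// filter. After the vavgr_hu steps, tmpb/tmpg/tmpr hold twice the 2x2
+// average (four samples summed, then halved with rounding), and each output
+// byte is the high byte of (a worked form of the vmadd/vmsub chain):
+//   U = 0x8080 + 63 * b - 42 * g - 21 * r;
+//   V = 0x8080 + 63 * r - 53 * g - 10 * b;
+// so the final __lsx_vpickod_b performs the implicit >> 8.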
+
+void I444ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_y, vec_u, vec_v, out_b, out_g, out_r;
+ __m128i vec_yl, vec_yh, vec_ul, vec_vl, vec_uh, vec_vh;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb, vec_ugvg;
+ __m128i const_80 = __lsx_vldi(0x480);
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i zero = __lsx_vldi(0);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_u = __lsx_vld(src_u, 0);
+ vec_v = __lsx_vld(src_v, 0);
+ vec_yl = __lsx_vilvl_b(vec_y, vec_y);
+ vec_ul = __lsx_vilvl_b(zero, vec_u);
+ vec_vl = __lsx_vilvl_b(zero, vec_v);
+ I444TORGB(vec_yl, vec_ul, vec_vl, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb,
+ out_b, out_g, out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ vec_yh = __lsx_vilvh_b(vec_y, vec_y);
+ vec_uh = __lsx_vilvh_b(zero, vec_u);
+ vec_vh = __lsx_vilvh_b(zero, vec_v);
+ I444TORGB(vec_yh, vec_uh, vec_vh, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb,
+ out_b, out_g, out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 16;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+void I400ToARGBRow_LSX(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_y, vec_yl, vec_yh, out0;
+ __m128i y_ev, y_od, dst0, dst1, dst2, dst3;
+ __m128i temp0, temp1;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i vec_yg = __lsx_vreplgr2vr_h(yuvconstants->kYToRgb[0]);
+ __m128i vec_yb = __lsx_vreplgr2vr_w(yuvconstants->kYBiasToRgb[0]);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_yl = __lsx_vilvl_b(vec_y, vec_y);
+ y_ev = __lsx_vmulwev_w_hu_h(vec_yl, vec_yg);
+ y_od = __lsx_vmulwod_w_hu_h(vec_yl, vec_yg);
+ y_ev = __lsx_vsrai_w(y_ev, 16);
+ y_od = __lsx_vsrai_w(y_od, 16);
+ y_ev = __lsx_vadd_w(y_ev, vec_yb);
+ y_od = __lsx_vadd_w(y_od, vec_yb);
+ y_ev = __lsx_vsrai_w(y_ev, 6);
+ y_od = __lsx_vsrai_w(y_od, 6);
+ y_ev = __lsx_vclip255_w(y_ev);
+ y_od = __lsx_vclip255_w(y_od);
+ out0 = __lsx_vpackev_h(y_od, y_ev);
+ temp0 = __lsx_vpackev_b(out0, out0);
+ temp1 = __lsx_vpackev_b(alpha, out0);
+ dst0 = __lsx_vilvl_h(temp1, temp0);
+ dst1 = __lsx_vilvh_h(temp1, temp0);
+ vec_yh = __lsx_vilvh_b(vec_y, vec_y);
+ y_ev = __lsx_vmulwev_w_hu_h(vec_yh, vec_yg);
+ y_od = __lsx_vmulwod_w_hu_h(vec_yh, vec_yg);
+ y_ev = __lsx_vsrai_w(y_ev, 16);
+ y_od = __lsx_vsrai_w(y_od, 16);
+ y_ev = __lsx_vadd_w(y_ev, vec_yb);
+ y_od = __lsx_vadd_w(y_od, vec_yb);
+ y_ev = __lsx_vsrai_w(y_ev, 6);
+ y_od = __lsx_vsrai_w(y_od, 6);
+ y_ev = __lsx_vclip255_w(y_ev);
+ y_od = __lsx_vclip255_w(y_od);
+ out0 = __lsx_vpackev_h(y_od, y_ev);
+ temp0 = __lsx_vpackev_b(out0, out0);
+ temp1 = __lsx_vpackev_b(alpha, out0);
+ dst2 = __lsx_vilvl_h(temp1, temp0);
+ dst3 = __lsx_vilvh_h(temp1, temp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_y += 16;
+ }
+}
+
+void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_y, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i alpha = __lsx_vldi(0xFF);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ tmp0 = __lsx_vilvl_b(vec_y, vec_y);
+ tmp1 = __lsx_vilvh_b(vec_y, vec_y);
+ tmp2 = __lsx_vilvl_b(alpha, vec_y);
+ tmp3 = __lsx_vilvh_b(alpha, vec_y);
+ dst0 = __lsx_vilvl_h(tmp2, tmp0);
+ dst1 = __lsx_vilvh_h(tmp2, tmp0);
+ dst2 = __lsx_vilvl_h(tmp3, tmp1);
+ dst3 = __lsx_vilvh_h(tmp3, tmp1);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_y += 16;
+ }
+}
+
+void YUY2ToARGBRow_LSX(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, vec_y, vec_vu;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_vrub, vec_vgug;
+ __m128i out_b, out_g, out_r;
+ __m128i const_80 = __lsx_vldi(0x480);
+ __m128i zero = __lsx_vldi(0);
+ __m128i alpha = __lsx_vldi(0xFF);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_yuy2, 0);
+ vec_y = __lsx_vpickev_b(src0, src0);
+ vec_vu = __lsx_vpickod_b(src0, src0);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_yuy2 += 16;
+ }
+}
+
+void UYVYToARGBRow_LSX(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, vec_y, vec_vu;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_vrub, vec_vgug;
+ __m128i out_b, out_g, out_r;
+ __m128i const_80 = __lsx_vldi(0x480);
+ __m128i zero = __lsx_vldi(0);
+ __m128i alpha = __lsx_vldi(0xFF);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_uyvy, 0);
+ vec_y = __lsx_vpickod_b(src0, src0);
+ vec_vu = __lsx_vpickev_b(src0, src0);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_uyvy += 16;
+ }
+}
+
+void InterpolateRow_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int32_t source_y_fraction) {
+ int x;
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8_t* nex_ptr = src_ptr + src_stride;
+ uint16_t y_fractions;
+ int len = width / 32;
+ __m128i src0, src1, nex0, nex1;
+ __m128i dst0, dst1, y_frac;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i const_128 = __lsx_vldi(0x480);
+
+ if (y1_fraction == 0) {
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ __lsx_vst(src0, dst_ptr, 0);
+ __lsx_vst(src1, dst_ptr, 16);
+ src_ptr += 32;
+ dst_ptr += 32;
+ }
+ return;
+ }
+
+ if (y1_fraction == 128) {
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1);
+ dst0 = __lsx_vavgr_bu(src0, nex0);
+ dst1 = __lsx_vavgr_bu(src1, nex1);
+ __lsx_vst(dst0, dst_ptr, 0);
+ __lsx_vst(dst1, dst_ptr, 16);
+ src_ptr += 32;
+ nex_ptr += 32;
+ dst_ptr += 32;
+ }
+ return;
+ }
+
+ y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8));
+ y_frac = __lsx_vreplgr2vr_h(y_fractions);
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1);
+ tmp0 = __lsx_vilvl_b(nex0, src0);
+ tmp1 = __lsx_vilvh_b(nex0, src0);
+ tmp2 = __lsx_vilvl_b(nex1, src1);
+ tmp3 = __lsx_vilvh_b(nex1, src1);
+ tmp0 = __lsx_vdp2add_h_bu(const_128, tmp0, y_frac);
+ tmp1 = __lsx_vdp2add_h_bu(const_128, tmp1, y_frac);
+ tmp2 = __lsx_vdp2add_h_bu(const_128, tmp2, y_frac);
+ tmp3 = __lsx_vdp2add_h_bu(const_128, tmp3, y_frac);
+ dst0 = __lsx_vsrlni_b_h(tmp1, tmp0, 8);
+ dst1 = __lsx_vsrlni_b_h(tmp3, tmp2, 8);
+ __lsx_vst(dst0, dst_ptr, 0);
+ __lsx_vst(dst1, dst_ptr, 16);
+ src_ptr += 32;
+ nex_ptr += 32;
+ dst_ptr += 32;
+ }
+}
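+
+// The dot-product form above is the standard fixed-point lerp; per byte it
+// computes (a scalar sketch, with f == source_y_fraction):
+//   dst = (uint8_t)((src * (256 - f) + next * f + 128) >> 8);
+// The interleaves pair each src byte with the byte one row below, so a
+// single __lsx_vdp2add_h_bu forms src * y0 + next * y1 + 128 in every lane.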
+
+void ARGBSetRow_LSX(uint8_t* dst_argb, uint32_t v32, int width) {
+ int x;
+ int len = width / 4;
+ __m128i dst0 = __lsx_vreplgr2vr_w(v32);
+
+ for (x = 0; x < len; x++) {
+ __lsx_vst(dst0, dst_argb, 0);
+ dst_argb += 16;
+ }
+}
+
+void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i dst0, dst1, dst2;
+ __m128i shuf0 = {0x0708030405000102, 0x110C0D0E090A0B06};
+ __m128i shuf1 = {0x1516171213140F10, 0x1F1E1B1C1D18191A};
+ __m128i shuf2 = {0x090405060102031E, 0x0D0E0F0A0B0C0708};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_raw, 0, src_raw, 16, src0, src1);
+ src2 = __lsx_vld(src_raw, 32);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src0, shuf1, dst0, dst1);
+ dst2 = __lsx_vshuf_b(src1, src2, shuf2);
+ dst1 = __lsx_vinsgr2vr_b(dst1, src_raw[32], 0x0E);
+ __lsx_vst(dst0, dst_rgb24, 0);
+ __lsx_vst(dst1, dst_rgb24, 16);
+ __lsx_vst(dst2, dst_rgb24, 32);
+ dst_rgb24 += 48;
+ src_raw += 48;
+ }
+}
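+
+// RAW (r,g,b byte order) to RGB24 (b,g,r byte order) is a pure byte swap;
+// per 3-byte pixel the scalar equivalent is:
+//   dst_rgb24[0] = src_raw[2];
+//   dst_rgb24[1] = src_raw[1];
+//   dst_rgb24[2] = src_raw[0];
+// The __lsx_vinsgr2vr_b patches output byte 30 (src_raw[32]), the one byte
+// the two-register shuffles cannot reach.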
+
+void MergeUVRow_LSX(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src0, src1);
+ dst0 = __lsx_vilvl_b(src1, src0);
+ dst1 = __lsx_vilvh_b(src1, src0);
+ __lsx_vst(dst0, dst_uv, 0);
+ __lsx_vst(dst1, dst_uv, 16);
+ src_u += 16;
+ src_v += 16;
+ dst_uv += 32;
+ }
+}
+
+void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3, tmp0, tmp1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vpickod_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src3, src2);
+ dst0 = __lsx_vpickod_b(tmp1, tmp0);
+ __lsx_vst(dst0, dst_a, 0);
+ src_argb += 64;
+ dst_a += 16;
+ }
+}
+
+void ARGBBlendRow_LSX(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, dst0, dst1;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i a0, a1, a2, a3;
+ __m128i const_256 = __lsx_vldi(0x500);
+ __m128i zero = __lsx_vldi(0);
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i control = (__m128i)v2u64{0xFF000000FF000000, 0xFF000000FF000000};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb1, 0, src_argb1, 16,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vshuf4i_b(src0, 0xFF);
+ tmp1 = __lsx_vshuf4i_b(src1, 0xFF);
+ a0 = __lsx_vilvl_b(zero, tmp0);
+ a1 = __lsx_vilvh_b(zero, tmp0);
+ a2 = __lsx_vilvl_b(zero, tmp1);
+ a3 = __lsx_vilvh_b(zero, tmp1);
+ reg0 = __lsx_vilvl_b(zero, src2);
+ reg1 = __lsx_vilvh_b(zero, src2);
+ reg2 = __lsx_vilvl_b(zero, src3);
+ reg3 = __lsx_vilvh_b(zero, src3);
+ DUP4_ARG2(__lsx_vsub_h, const_256, a0, const_256, a1, const_256, a2,
+ const_256, a3, a0, a1, a2, a3);
+ DUP4_ARG2(__lsx_vmul_h, a0, reg0, a1, reg1, a2, reg2, a3, reg3, reg0, reg1,
+ reg2, reg3);
+ DUP2_ARG3(__lsx_vsrani_b_h, reg1, reg0, 8, reg3, reg2, 8, dst0, dst1);
+ dst0 = __lsx_vsadd_bu(dst0, src0);
+ dst1 = __lsx_vsadd_bu(dst1, src1);
+ dst0 = __lsx_vbitsel_v(dst0, alpha, control);
+ dst1 = __lsx_vbitsel_v(dst1, alpha, control);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ src_argb += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
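+
+// ARGBBlendRow uses the usual 8-bit "over" approximation; per color channel
+// it computes (a scalar sketch, with fg/bg as the src_argb/src_argb1 bytes):
+//   dst = min(255, fg + ((256 - fg_alpha) * bg >> 8));
+// and the __lsx_vbitsel_v with the 0xFF000000 mask then forces the
+// destination alpha to 255.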
+
+void ARGBQuantizeRow_LSX(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i vec_size = __lsx_vreplgr2vr_b(interval_size);
+ __m128i vec_offset = __lsx_vreplgr2vr_b(interval_offset);
+ __m128i vec_scale = __lsx_vreplgr2vr_w(scale);
+ __m128i zero = __lsx_vldi(0);
+ __m128i control = (__m128i)v2u64{0xFF000000FF000000, 0xFF000000FF000000};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, dst_argb, 32, dst_argb, 48,
+ src0, src1, src2, src3);
+ reg0 = __lsx_vilvl_b(zero, src0);
+ reg1 = __lsx_vilvh_b(zero, src0);
+ reg2 = __lsx_vilvl_b(zero, src1);
+ reg3 = __lsx_vilvh_b(zero, src1);
+ reg4 = __lsx_vilvl_b(zero, src2);
+ reg5 = __lsx_vilvh_b(zero, src2);
+ reg6 = __lsx_vilvl_b(zero, src3);
+ reg7 = __lsx_vilvh_b(zero, src3);
+ tmp0 = __lsx_vilvl_h(zero, reg0);
+ tmp1 = __lsx_vilvh_h(zero, reg0);
+ tmp2 = __lsx_vilvl_h(zero, reg1);
+ tmp3 = __lsx_vilvh_h(zero, reg1);
+ tmp4 = __lsx_vilvl_h(zero, reg2);
+ tmp5 = __lsx_vilvh_h(zero, reg2);
+ tmp6 = __lsx_vilvl_h(zero, reg3);
+ tmp7 = __lsx_vilvh_h(zero, reg3);
+ DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale,
+ tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale,
+ tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16,
+ tmp7, tmp6, 16, reg0, reg1, reg2, reg3);
+ dst0 = __lsx_vpickev_b(reg1, reg0);
+ dst1 = __lsx_vpickev_b(reg3, reg2);
+ tmp0 = __lsx_vilvl_h(zero, reg4);
+ tmp1 = __lsx_vilvh_h(zero, reg4);
+ tmp2 = __lsx_vilvl_h(zero, reg5);
+ tmp3 = __lsx_vilvh_h(zero, reg5);
+ tmp4 = __lsx_vilvl_h(zero, reg6);
+ tmp5 = __lsx_vilvh_h(zero, reg6);
+ tmp6 = __lsx_vilvl_h(zero, reg7);
+ tmp7 = __lsx_vilvh_h(zero, reg7);
+ DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale,
+ tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale,
+ tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16,
+ tmp7, tmp6, 16, reg0, reg1, reg2, reg3);
+ dst2 = __lsx_vpickev_b(reg1, reg0);
+ dst3 = __lsx_vpickev_b(reg3, reg2);
+ DUP4_ARG2(__lsx_vmul_b, dst0, vec_size, dst1, vec_size, dst2, vec_size,
+ dst3, vec_size, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vadd_b, dst0, vec_offset, dst1, vec_offset, dst2,
+ vec_offset, dst3, vec_offset, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vbitsel_v, dst0, src0, control, dst1, src1, control, dst2,
+ src2, control, dst3, src3, control, dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ }
+}
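+
+// ARGBQuantizeRow applies the posterize step to B, G and R in place; per
+// channel the scalar form is:
+//   v = (uint8_t)((v * scale >> 16) * interval_size + interval_offset);
+// The closing __lsx_vbitsel_v with the 0xFF000000 mask writes the original
+// alpha byte back unchanged.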
+
+void ARGBColorMatrixRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, src1, tmp0, tmp1, dst0, dst1;
+ __m128i tmp_b, tmp_g, tmp_r, tmp_a;
+ __m128i reg_b, reg_g, reg_r, reg_a;
+ __m128i matrix_b = __lsx_vldrepl_w(matrix_argb, 0);
+ __m128i matrix_g = __lsx_vldrepl_w(matrix_argb, 4);
+ __m128i matrix_r = __lsx_vldrepl_w(matrix_argb, 8);
+ __m128i matrix_a = __lsx_vldrepl_w(matrix_argb, 12);
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src0, matrix_b, src0, matrix_g, src0, matrix_r,
+ src0, matrix_a, tmp_b, tmp_g, tmp_r, tmp_a);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src1, matrix_b, src1, matrix_g, src1, matrix_r,
+ src1, matrix_a, reg_b, reg_g, reg_r, reg_a);
+ DUP4_ARG2(__lsx_vhaddw_w_h, tmp_b, tmp_b, tmp_g, tmp_g, tmp_r, tmp_r, tmp_a,
+ tmp_a, tmp_b, tmp_g, tmp_r, tmp_a);
+ DUP4_ARG2(__lsx_vhaddw_w_h, reg_b, reg_b, reg_g, reg_g, reg_r, reg_r, reg_a,
+ reg_a, reg_b, reg_g, reg_r, reg_a);
+ DUP4_ARG2(__lsx_vsrai_w, tmp_b, 6, tmp_g, 6, tmp_r, 6, tmp_a, 6, tmp_b,
+ tmp_g, tmp_r, tmp_a);
+ DUP4_ARG2(__lsx_vsrai_w, reg_b, 6, reg_g, 6, reg_r, 6, reg_a, 6, reg_b,
+ reg_g, reg_r, reg_a);
+ DUP4_ARG1(__lsx_vclip255_w, tmp_b, tmp_g, tmp_r, tmp_a, tmp_b, tmp_g, tmp_r,
+ tmp_a);
+ DUP4_ARG1(__lsx_vclip255_w, reg_b, reg_g, reg_r, reg_a, reg_b, reg_g, reg_r,
+ reg_a);
+ DUP4_ARG2(__lsx_vpickev_h, reg_b, tmp_b, reg_g, tmp_g, reg_r, tmp_r, reg_a,
+ tmp_a, tmp_b, tmp_g, tmp_r, tmp_a);
+ tmp0 = __lsx_vpackev_b(tmp_g, tmp_b);
+ tmp1 = __lsx_vpackev_b(tmp_a, tmp_r);
+ dst0 = __lsx_vilvl_h(tmp1, tmp0);
+ dst1 = __lsx_vilvh_h(tmp1, tmp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
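+
+// ARGBColorMatrixRow computes each output channel as a dot product with one
+// row of the 4x4 matrix; per pixel (a scalar sketch, c = 0..3 for B,G,R,A):
+//   out[c] = clamp255((b * m[4 * c + 0] + g * m[4 * c + 1] +
+//                      r * m[4 * c + 2] + a * m[4 * c + 3]) >> 6);
+// __lsx_vdp2_h_bu_b multiplies unsigned pixel bytes by the signed matrix
+// entries pairwise, and __lsx_vhaddw_w_h finishes each 4-term sum.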
+
+void SplitUVRow_LSX(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m128i src0, src1, src2, src3;
+ __m128i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, dst0, dst1);
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst2, dst3);
+ __lsx_vst(dst0, dst_u, 0);
+ __lsx_vst(dst1, dst_u, 16);
+ __lsx_vst(dst2, dst_v, 0);
+ __lsx_vst(dst3, dst_v, 16);
+ src_uv += 64;
+ dst_u += 32;
+ dst_v += 32;
+ }
+}
+
+void SetRow_LSX(uint8_t* dst, uint8_t v8, int width) {
+ int x;
+ int len = width / 16;
+ __m128i dst0 = __lsx_vreplgr2vr_b(v8);
+
+ for (x = 0; x < len; x++) {
+ __lsx_vst(dst0, dst, 0);
+ dst += 16;
+ }
+}
+
+void MirrorSplitUVRow_LSX(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m128i src0, src1, src2, src3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i shuff0 = {0x10121416181A1C1E, 0x00020406080A0C0E};
+ __m128i shuff1 = {0x11131517191B1D1F, 0x01030507090B0D0F};
+
+ src_uv += (width << 1);
+ for (x = 0; x < len; x++) {
+ src_uv -= 64;
+ DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src2,
+ src3, src0, src1);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, shuff1, src3, src2, shuff1, src1, src0,
+ shuff0, src3, src2, shuff0, dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst_v, 0);
+ __lsx_vst(dst1, dst_v, 16);
+ __lsx_vst(dst2, dst_u, 0);
+ __lsx_vst(dst3, dst_u, 16);
+ dst_u += 32;
+ dst_v += 32;
+ }
+}
+
+void HalfFloatRow_LSX(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ int x;
+ int len = width / 32;
+ float mult = 1.9259299444e-34f * scale;
+ __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128 vec_mult = (__m128)__lsx_vldrepl_w(&mult, 0);
+ __m128i zero = __lsx_vldi(0);
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ DUP4_ARG2(__lsx_vilvl_h, zero, src0, zero, src1, zero, src2, zero, src3,
+ tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vilvh_h, zero, src0, zero, src1, zero, src2, zero, src3,
+ tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG1(__lsx_vffint_s_wu, tmp0, tmp2, tmp4, tmp6, reg0, reg2, reg4,
+ reg6);
+ DUP4_ARG1(__lsx_vffint_s_wu, tmp1, tmp3, tmp5, tmp7, reg1, reg3, reg5,
+ reg7);
+ DUP4_ARG2(__lsx_vfmul_s, reg0, vec_mult, reg1, vec_mult, reg2, vec_mult,
+ reg3, vec_mult, reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vfmul_s, reg4, vec_mult, reg5, vec_mult, reg6, vec_mult,
+ reg7, vec_mult, reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg0, 13, (v4u32)reg1, 13, (v4u32)reg2, 13,
+ (v4u32)reg3, 13, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg4, 13, (v4u32)reg5, 13, (v4u32)reg6, 13,
+ (v4u32)reg7, 13, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
+ dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ __lsx_vst(dst3, dst, 48);
+ src += 32;
+ dst += 32;
+ }
+}
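+
+// HalfFloatRow relies on the classic exponent-rebias trick: 1.9259299444e-34f
+// is 2^-112, the gap between the float exponent bias (127) and the half bias
+// (15), so after the multiply the float bit pattern shifted right by 13
+// (23 - 10 mantissa bits) is the IEEE binary16 encoding. A scalar sketch for
+// values that land in the normal half range:
+//   float f = src[i] * scale * 1.9259299444e-34f;
+//   uint32_t bits;
+//   memcpy(&bits, &f, sizeof(bits));
+//   dst[i] = (uint16_t)(bits >> 13);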
+
+struct RgbConstants {
+ uint8_t kRGBToY[4];
+ uint16_t kAddY;
+ uint16_t pad;
+};
+
+// RGB to JPEG coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
+ 128,
+ 0};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+ 0x1080,
+ 0};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
+ 0x1080,
+ 0};
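+
+// These tables feed the Y-matrix rows below; per pixel they evaluate the
+// fixed-point luma (a worked form of the maddw accumulation):
+//   Y = (uint8_t)((kRGBToY[0] * B + kRGBToY[1] * G + kRGBToY[2] * R +
+//                  kAddY) >> 8);
+// e.g. kRgb24I601Constants gives Y = (25*B + 129*G + 66*R + 0x1080) >> 8,
+// the BT.601 studio-range luma with its +16 offset and rounding built into
+// kAddY.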
+
+// ARGB expects the first 3 bytes of each pixel to contain RGB; the 4th
+// byte (alpha) is ignored.
+static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
+ "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
+ "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
+ "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants
+ "1: \n\t"
+ "vld $vr4, %0, 0 \n\t"
+ "vld $vr5, %0, 16 \n\t"
+ "vld $vr6, %0, 32 \n\t"
+ "vld $vr7, %0, 48 \n\t" // load 16 pixels of
+ // ARGB
+ "vor.v $vr12, $vr3, $vr3 \n\t"
+ "vor.v $vr13, $vr3, $vr3 \n\t"
+ "addi.d %2, %2, -16 \n\t" // 16 processed per
+ // loop.
+ "vpickev.b $vr8, $vr5, $vr4 \n\t" // BR
+ "vpickev.b $vr10, $vr7, $vr6 \n\t"
+ "vpickod.b $vr9, $vr5, $vr4 \n\t" // GA
+ "vpickod.b $vr11, $vr7, $vr6 \n\t"
+ "vmaddwev.h.bu $vr12, $vr8, $vr0 \n\t" // B
+ "vmaddwev.h.bu $vr13, $vr10, $vr0 \n\t"
+ "vmaddwev.h.bu $vr12, $vr9, $vr1 \n\t" // G
+ "vmaddwev.h.bu $vr13, $vr11, $vr1 \n\t"
+ "vmaddwod.h.bu $vr12, $vr8, $vr2 \n\t" // R
+ "vmaddwod.h.bu $vr13, $vr10, $vr2 \n\t"
+ "addi.d %0, %0, 64 \n\t"
+ "vpickod.b $vr10, $vr13, $vr12 \n\t"
+ "vst $vr10, %1, 0 \n\t"
+ "addi.d %1, %1, 16 \n\t"
+ "bnez %2, 1b \n\t"
+ : "+&r"(src_argb), // %0
+ "+&r"(dst_y), // %1
+ "+&r"(width) // %2
+ : "r"(rgbconstants)
+ : "memory");
+}
+
+void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_LSX(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+
+void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_LSX(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_LSX(src_abgr, dst_y, width, &kRawI601Constants);
+}
+
+void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_LSX(src_abgr, dst_yj, width, &kRawJPEGConstants);
+}
+
+// RGBA expects the first byte of each pixel to be alpha (ignored), followed
+// by 3 bytes of RGB.
+// Same code as ARGB, except for which even/odd byte lanes feed each multiply.
+static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
+ "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
+ "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
+ "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants
+ "1: \n\t"
+ "vld $vr4, %0, 0 \n\t"
+ "vld $vr5, %0, 16 \n\t"
+ "vld $vr6, %0, 32 \n\t"
+ "vld $vr7, %0, 48 \n\t" // load 16 pixels of
+ // RGBA
+ "vor.v $vr12, $vr3, $vr3 \n\t"
+ "vor.v $vr13, $vr3, $vr3 \n\t"
+ "addi.d %2, %2, -16 \n\t" // 16 processed per
+ // loop.
+ "vpickev.b $vr8, $vr5, $vr4 \n\t" // AG
+ "vpickev.b $vr10, $vr7, $vr6 \n\t"
+ "vpickod.b $vr9, $vr5, $vr4 \n\t" // BR
+ "vpickod.b $vr11, $vr7, $vr6 \n\t"
+ "vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" // B
+ "vmaddwev.h.bu $vr13, $vr11, $vr0 \n\t"
+ "vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" // G
+ "vmaddwod.h.bu $vr13, $vr10, $vr1 \n\t"
+ "vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" // R
+ "vmaddwod.h.bu $vr13, $vr11, $vr2 \n\t"
+ "addi.d %0, %0, 64 \n\t"
+ "vpickod.b $vr10, $vr13, $vr12 \n\t"
+ "vst $vr10, %1, 0 \n\t"
+ "addi.d %1, %1, 16 \n\t"
+ "bnez %2, 1b \n\t"
+ : "+&r"(src_rgba), // %0
+ "+&r"(dst_y), // %1
+ "+&r"(width) // %2
+ : "r"(rgbconstants)
+ : "memory");
+}
+
+void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_LSX(src_rgba, dst_y, width, &kRgb24I601Constants);
+}
+
+void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
+ RGBAToYMatrixRow_LSX(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_LSX(src_bgra, dst_y, width, &kRawI601Constants);
+}
+
+static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18,
+ 20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6,
+ 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
+ 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
+ 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
+ asm volatile(
+ "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
+ "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
+ "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
+ "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants
+ "vld $vr4, %4, 0 \n\t" // load shuff
+ "vld $vr5, %4, 16 \n\t"
+ "vld $vr6, %4, 32 \n\t"
+ "vld $vr7, %4, 48 \n\t"
+ "1: \n\t"
+ "vld $vr8, %0, 0 \n\t"
+ "vld $vr9, %0, 16 \n\t"
+ "vld $vr10, %0, 32 \n\t" // load 16 pixels of
+ // RGB
+ "vor.v $vr12, $vr3, $vr3 \n\t"
+ "vor.v $vr13, $vr3, $vr3 \n\t"
+ "addi.d %2, %2, -16 \n\t" // 16 processed per
+ // loop.
+ "vshuf.b $vr14, $vr9, $vr8, $vr4 \n\t"
+ "vshuf.b $vr15, $vr9, $vr10, $vr5 \n\t"
+ "vshuf.b $vr16, $vr9, $vr8, $vr6 \n\t"
+ "vshuf.b $vr17, $vr9, $vr10, $vr7 \n\t"
+ "vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" // G
+ "vmaddwev.h.bu $vr13, $vr17, $vr1 \n\t"
+ "vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" // B
+ "vmaddwev.h.bu $vr13, $vr15, $vr0 \n\t"
+ "vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" // R
+ "vmaddwod.h.bu $vr13, $vr15, $vr2 \n\t"
+ "addi.d %0, %0, 48 \n\t"
+ "vpickod.b $vr10, $vr13, $vr12 \n\t"
+ "vst $vr10, %1, 0 \n\t"
+ "addi.d %1, %1, 16 \n\t"
+ "bnez %2, 1b \n\t"
+ : "+&r"(src_rgba), // %0
+ "+&r"(dst_y), // %1
+ "+&r"(width) // %2
+ : "r"(rgbconstants), // %3
+ "r"(shuff) // %4
+ : "memory");
+}
+
+void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants);
+}
+
+void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants);
+}
+
+void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
diff --git a/files/source/row_msa.cc b/source/row_msa.cc
index 5c0239a3..b7d5bb5e 100644
--- a/files/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -24,16 +24,14 @@ extern "C" {
#define ALPHA_VAL (-1)
// Fill YUV -> RGB conversion constants into vectors
-#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
- { \
- ub = __msa_fill_w(yuvconst->kUVToB[0]); \
- vr = __msa_fill_w(yuvconst->kUVToR[1]); \
- ug = __msa_fill_w(yuvconst->kUVToG[0]); \
- vg = __msa_fill_w(yuvconst->kUVToG[1]); \
- bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \
- bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \
- br = __msa_fill_w(yuvconst->kUVBiasR[0]); \
- yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
+ { \
+ ub = __msa_fill_w(yuvconst->kUVToB[0]); \
+ vr = __msa_fill_w(yuvconst->kUVToR[1]); \
+ ug = __msa_fill_w(yuvconst->kUVToG[0]); \
+ vg = __msa_fill_w(yuvconst->kUVToG[1]); \
+ yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
+ yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]); \
}
// Load YUV 422 pixel data
@@ -70,54 +68,52 @@ extern "C" {
}
// Convert 8 pixels of YUV 420 to RGB.
-#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
- { \
- v8i16 vec0_m, vec1_m; \
- v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
- v4i32 reg5_m, reg6_m, reg7_m; \
- v16i8 zero_m = {0}; \
- \
- vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
- vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
- reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
- reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
- reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \
- reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \
- reg0_m *= yg; \
- reg1_m *= yg; \
- reg2_m *= ubvr; \
- reg3_m *= ubvr; \
- reg0_m = __msa_srai_w(reg0_m, 16); \
- reg1_m = __msa_srai_w(reg1_m, 16); \
- reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
- reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
- reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
- reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
- reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
- reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
- reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
- reg5_m = reg0_m - reg5_m; \
- reg6_m = reg1_m - reg6_m; \
- reg2_m = reg0_m - reg2_m; \
- reg3_m = reg1_m - reg3_m; \
- reg7_m = reg0_m - reg7_m; \
- reg4_m = reg1_m - reg4_m; \
- reg5_m += bb; \
- reg6_m += bb; \
- reg7_m += bg; \
- reg4_m += bg; \
- reg2_m += br; \
- reg3_m += br; \
- reg5_m = __msa_srai_w(reg5_m, 6); \
- reg6_m = __msa_srai_w(reg6_m, 6); \
- reg7_m = __msa_srai_w(reg7_m, 6); \
- reg4_m = __msa_srai_w(reg4_m, 6); \
- reg2_m = __msa_srai_w(reg2_m, 6); \
- reg3_m = __msa_srai_w(reg3_m, 6); \
- CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
- out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
- out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
- out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
+ { \
+ v8i16 vec0_m, vec1_m; \
+ v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
+ v4i32 reg5_m, reg6_m, reg7_m; \
+ v16i8 temp_m, zero_m = {0}; \
+ \
+ vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
+ vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
+ reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
+ reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
+ vec1_m = (v8i16)__msa_subv_h(vec1_m, const_0x80); \
+ temp_m = (v16i8)__msa_clti_s_h(vec1_m, 0); \
+ reg2_m = (v4i32)__msa_ilvr_h((v8i16)temp_m, (v8i16)vec1_m); \
+ reg3_m = (v4i32)__msa_ilvl_h((v8i16)temp_m, (v8i16)vec1_m); \
+ reg0_m *= yg; \
+ reg1_m *= yg; \
+ reg2_m *= ubvr; \
+ reg3_m *= ubvr; \
+ reg0_m = __msa_srai_w(reg0_m, 16); \
+ reg1_m = __msa_srai_w(reg1_m, 16); \
+ reg0_m += yb; \
+ reg1_m += yb; \
+ reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
+ reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
+ reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
+ reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
+ reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
+ reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
+ reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
+ reg5_m = reg0_m + reg5_m; \
+ reg6_m = reg1_m + reg6_m; \
+ reg2_m = reg0_m + reg2_m; \
+ reg3_m = reg1_m + reg3_m; \
+ reg7_m = reg0_m - reg7_m; \
+ reg4_m = reg1_m - reg4_m; \
+ reg5_m = __msa_srai_w(reg5_m, 6); \
+ reg6_m = __msa_srai_w(reg6_m, 6); \
+ reg7_m = __msa_srai_w(reg7_m, 6); \
+ reg4_m = __msa_srai_w(reg4_m, 6); \
+ reg2_m = __msa_srai_w(reg2_m, 6); \
+ reg3_m = __msa_srai_w(reg3_m, 6); \
+ CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
+ out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
+ out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
+ out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
}
// Pack and Store 8 ARGB values.
@@ -155,11 +151,10 @@ extern "C" {
}
// Loads current and next row of ARGB input and averages it to calculate U and V
-#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \
+#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \
{ \
v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v16u8 vec8_m, vec9_m; \
v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
v8u16 reg8_m, reg9_m; \
\
@@ -195,81 +190,81 @@ extern "C" {
reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
- reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
- reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
- reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
- argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
- argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
- src0_m = (v16u8)__msa_ld_b((void*)s, 64); \
- src1_m = (v16u8)__msa_ld_b((void*)s, 80); \
- src2_m = (v16u8)__msa_ld_b((void*)s, 96); \
- src3_m = (v16u8)__msa_ld_b((void*)s, 112); \
- src4_m = (v16u8)__msa_ld_b((void*)t, 64); \
- src5_m = (v16u8)__msa_ld_b((void*)t, 80); \
- src6_m = (v16u8)__msa_ld_b((void*)t, 96); \
- src7_m = (v16u8)__msa_ld_b((void*)t, 112); \
- vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
- vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
- vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
- vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
- vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
- vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
- vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
- vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
- reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \
- reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \
- reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \
- reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \
- reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \
- reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \
- reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \
- reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \
- reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
- reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
- reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
- reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
- reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
- reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
- reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
- reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
- reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
- reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
- argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
- argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ reg8_m += const_0x0101; \
+ reg9_m += const_0x0101; \
+ reg0_m += const_0x0101; \
+ reg1_m += const_0x0101; \
+ argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \
+ argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \
+ argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \
+ argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \
}
-// Takes ARGB input and calculates U and V.
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
- shf0, shf1, shf2, shf3, v_out, u_out) \
+ shf0, shf1, shf2, shf3, shift, u_out, v_out) \
{ \
- v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \
+ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
\
- vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \
- vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \
- vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \
- vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \
- vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \
- vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \
- vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \
- vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \
- reg0_m = __msa_dotp_u_h(vec0_m, const1); \
- reg1_m = __msa_dotp_u_h(vec1_m, const1); \
- reg2_m = __msa_dotp_u_h(vec4_m, const1); \
- reg3_m = __msa_dotp_u_h(vec5_m, const1); \
- reg0_m += const3; \
- reg1_m += const3; \
- reg2_m += const3; \
- reg3_m += const3; \
- reg0_m -= __msa_dotp_u_h(vec2_m, const0); \
- reg1_m -= __msa_dotp_u_h(vec3_m, const0); \
- reg2_m -= __msa_dotp_u_h(vec6_m, const2); \
- reg3_m -= __msa_dotp_u_h(vec7_m, const2); \
- v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \
- u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \
+ vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_w(vec0_m, const0); \
+ reg1_m = __msa_dotp_u_w(vec1_m, const0); \
+ reg2_m = __msa_dotp_u_w(vec4_m, const0); \
+ reg3_m = __msa_dotp_u_w(vec5_m, const0); \
+ reg0_m += const1; \
+ reg1_m += const1; \
+ reg2_m += const1; \
+ reg3_m += const1; \
+ reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \
+ reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \
+ reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \
+ reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \
+ reg0_m = __msa_srl_w(reg0_m, shift); \
+ reg1_m = __msa_srl_w(reg1_m, shift); \
+ reg2_m = __msa_srl_w(reg2_m, shift); \
+ reg3_m = __msa_srl_w(reg3_m, shift); \
+ u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
+ v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+ }
+
+// Takes ARGB input and calculates U and V.
+#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
+ shf0, shf1, shf2, shf3, v_out, u_out) \
+ { \
+ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
+ \
+ vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_w(vec0_m, const1); \
+ reg1_m = __msa_dotp_u_w(vec1_m, const1); \
+ reg2_m = __msa_dotp_u_w(vec4_m, const1); \
+ reg3_m = __msa_dotp_u_w(vec5_m, const1); \
+ reg0_m += (v4u32)const3; \
+ reg1_m += (v4u32)const3; \
+ reg2_m += (v4u32)const3; \
+ reg3_m += (v4u32)const3; \
+ reg0_m -= __msa_dotp_u_w(vec2_m, const0); \
+ reg1_m -= __msa_dotp_u_w(vec3_m, const0); \
+ reg2_m -= __msa_dotp_u_w(vec6_m, const2); \
+ reg3_m -= __msa_dotp_u_w(vec7_m, const2); \
+ u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+ v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
+ u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \
+ v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \
}
// Load I444 pixel data
@@ -285,6 +280,34 @@ extern "C" {
out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \
}
+#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
+ { \
+ v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
+ v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \
+ _tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); \
+ _tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); \
+ _tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \
+ _tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \
+ _tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \
+ _tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \
+ _reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); \
+ _reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); \
+ _reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \
+ _reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \
+ _reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \
+ _reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \
+ _reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); \
+ _reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); \
+ _reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); \
+ _reg1 = const_8080 + const_112 * _reg0; \
+ _reg3 = const_8080 + const_112 * _reg4; \
+ _reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); \
+ _reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); \
+ _reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); \
+ _reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); \
+ _dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); \
+ }
+
void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
@@ -302,6 +325,20 @@ void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
}
}
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ int x;
+ v8u16 src, dst;
+ v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0};
+ src_uv += (width - 8) << 1;
+ for (x = 0; x < width; x += 8) {
+ src = LD_UH(src_uv);
+ dst = __msa_vshf_h(shuffler, src, src);
+ ST_UH(dst, dst_uv);
+ src_uv -= 16;
+ dst_uv += 16;
+ }
+}
+
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
@@ -376,20 +413,19 @@ void I422ToARGBRow_MSA(const uint8_t* src_y,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_y += 8;
src_u += 4;
@@ -407,20 +443,19 @@ void I422ToRGBARow_MSA(const uint8_t* src_y,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(alpha, vec0, vec1, vec2, dst_argb);
src_y += 8;
src_u += 4;
@@ -440,12 +475,12 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
int64_t data_a;
v16u8 src0, src1, src2, src3;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v4i32 zero = {0};
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -454,8 +489,7 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
STOREARGB(vec0, vec1, vec2, src3, dst_argb);
src_y += 8;
@@ -476,17 +510,17 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y,
int64_t data_u, data_v;
v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 reg0, reg1, reg2, reg3;
v2i64 zero = {0};
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
11, 29, 12, 13, 30, 14, 15, 31};
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -499,10 +533,8 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y,
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
- YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec3, vec4, vec5);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
+ YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec3, vec4, vec5);
reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
@@ -529,24 +561,23 @@ void I422ToRGB565Row_MSA(const uint8_t* src_y,
int x;
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec2, vec1);
- vec0 = __msa_srai_h(vec0, 3);
- vec1 = __msa_srai_h(vec1, 3);
- vec2 = __msa_srai_h(vec2, 2);
- vec1 = __msa_slli_h(vec1, 11);
- vec2 = __msa_slli_h(vec2, 5);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
+ vec0 = __msa_srli_h(vec0, 3);
+ vec1 = __msa_srli_h(vec1, 2);
+ vec2 = __msa_srli_h(vec2, 3);
+ vec2 = __msa_slli_h(vec2, 11);
+ vec1 = __msa_slli_h(vec1, 5);
vec0 |= vec1;
dst0 = (v16u8)(vec2 | vec0);
ST_UB(dst0, dst_rgb565);
@@ -568,25 +599,24 @@ void I422ToARGB4444Row_MSA(const uint8_t* src_y,
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
v8u16 reg0, reg1, reg2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
+ v8u16 mask = (v8u16)__msa_fill_h(0x00F0);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
- reg0 = (v8u16)__msa_srai_h(vec0, 4);
- reg1 = (v8u16)__msa_srai_h(vec1, 4);
- reg2 = (v8u16)__msa_srai_h(vec2, 4);
- reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
- reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
+ reg0 = (v8u16)__msa_srli_h(vec0, 4);
+ reg2 = (v8u16)__msa_srli_h(vec2, 4);
+ reg1 = (v8u16)__msa_and_v(vec1, mask);
+ reg2 = (v8u16)__msa_slli_h(reg2, 8);
reg1 |= const_0xF000;
reg0 |= reg2;
dst0 = (v16u8)(reg1 | reg0);
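For ARGB4444 the green nibble no longer shifts down and back up: vec1 & 0x00F0 (the new mask constant) keeps the top four green bits already sitting at bit 4, saving a shift per lane. The scalar equivalent, with alpha forced to 0xF:

/* Pack clamped 8-bit B, G, R into ARGB4444 with opaque alpha. */
static uint16_t pack_argb4444(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)(0xF000 | ((r >> 4) << 8) | (g & 0xF0) | (b >> 4));
}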
@@ -608,23 +638,22 @@ void I422ToARGB1555Row_MSA(const uint8_t* src_y,
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
v8u16 reg0, reg1, reg2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
- reg0 = (v8u16)__msa_srai_h(vec0, 3);
- reg1 = (v8u16)__msa_srai_h(vec1, 3);
- reg2 = (v8u16)__msa_srai_h(vec2, 3);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
+ reg0 = (v8u16)__msa_srli_h(vec0, 3);
+ reg1 = (v8u16)__msa_srli_h(vec1, 3);
+ reg2 = (v8u16)__msa_srli_h(vec2, 3);
reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
reg1 |= const_0x8000;
@@ -768,7 +797,7 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
}
}
-void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
@@ -779,10 +808,10 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
@@ -809,38 +838,39 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void ARGBToUVRow_MSA(const uint8_t* src_argb0,
+void ARGBToUVRow_MSA(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* src_argb0_next = src_argb0 + src_stride_argb;
+ const uint8_t* src_argb_next = src_argb + src_stride_argb;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
v16u8 dst0, dst1;
- v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2F);

+ v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
- src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
- src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
- src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
- src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48);
+ src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64);
+ src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80);
+ src6 = (v16u8)__msa_ld_b((v16u8*)src_argb, 96);
+ src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
@@ -861,14 +891,14 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
reg3 = __msa_hadd_u_h(vec5, vec5);
reg4 = __msa_hadd_u_h(vec0, vec0);
reg5 = __msa_hadd_u_h(vec1, vec1);
- src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
- src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
- src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
- src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
- src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
- src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
- src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
- src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48);
+ src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64);
+ src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80);
+ src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96);
+ src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
@@ -889,12 +919,18 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
reg3 += __msa_hadd_u_h(vec5, vec5);
reg4 += __msa_hadd_u_h(vec0, vec0);
reg5 += __msa_hadd_u_h(vec1, vec1);
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
- reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
- reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
- reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
- reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
- reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg4 += const_0x0001;
+ reg5 += const_0x0001;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1);
+ reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1);
+ reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1);
+ reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1);
reg6 = reg0 * const_0x70;
reg7 = reg1 * const_0x70;
reg8 = reg2 * const_0x4A;
@@ -925,8 +961,8 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
ST_UB(dst0, dst_u);
ST_UB(dst1, dst_v);
- src_argb0 += 128;
- src_argb0_next += 128;
+ src_argb += 128;
+ src_argb_next += 128;
dst_u += 16;
dst_v += 16;
}
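The chroma coefficients above are halved (0x70 to 0x38, 0x4A to 0x25, 0x26 to 0x13, 0x5E to 0x2F, 0x12 to 0x09) while the subsampling shift drops from 2 to 1 with a +1 rounding term, so the 2x2 block average is now rounded instead of truncated and the overall >>8 scale is preserved. In scalar terms, per output U sample (V is analogous with the 0x38/0x2F/0x09 weights; helper name illustrative):

/* b4/g4/r4 are channel sums over a 2x2 block, as formed by the hadd steps. */
static uint8_t block_to_u(int b4, int g4, int r4) {
  int b = (b4 + 1) >> 1;  /* rounded half-sum == 2x the block average */
  int g = (g4 + 1) >> 1;
  int r = (r4 + 1) >> 1;
  return (uint8_t)((0x38 * b - 0x25 * g - 0x13 * r + 0x8080) >> 8);
}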
@@ -1153,7 +1189,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb,
}
}
-void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
+void ARGBMultiplyRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -1164,7 +1200,7 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
v8i16 zero = {0};
for (x = 0; x < width; x += 4) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
@@ -1186,13 +1222,13 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_argb);
- src_argb0 += 16;
+ src_argb += 16;
src_argb1 += 16;
dst_argb += 16;
}
}
-void ARGBAddRow_MSA(const uint8_t* src_argb0,
+void ARGBAddRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -1200,20 +1236,20 @@ void ARGBAddRow_MSA(const uint8_t* src_argb0,
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
dst0 = __msa_adds_u_b(src0, src2);
dst1 = __msa_adds_u_b(src1, src3);
ST_UB2(dst0, dst1, dst_argb, 16);
- src_argb0 += 32;
+ src_argb += 32;
src_argb1 += 32;
dst_argb += 32;
}
}
-void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
+void ARGBSubtractRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -1221,14 +1257,14 @@ void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
dst0 = __msa_subs_u_b(src0, src2);
dst1 = __msa_subs_u_b(src1, src3);
ST_UB2(dst0, dst1, dst_argb, 16);
- src_argb0 += 32;
+ src_argb += 32;
src_argb1 += 32;
dst_argb += 32;
}
@@ -1412,17 +1448,17 @@ void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int x;
v16u8 src0, src1, vec0, vec1, dst0, dst1;
v8u16 reg0;
- v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
- v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+ v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D);
+ v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
for (x = 0; x < width; x += 8) {
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
- reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
- reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
- reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
+ reg0 = __msa_dotp_u_h(vec0, const_0x961D);
+ reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D);
+ reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8);
vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
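ARGBGrayRow upgrades from 7-bit weights (15, 75, 38 with a rounded >>7) to the full 8-bit BT.601 full-range weights: 0x961D packs G=0x96 (150) and B=0x1D (29) for the byte dot-product, 0x4D (77) covers R, and srari rounds the final >>8. Scalar form:

/* Full-range luma used for gray: weights sum to 256, rounded shift. */
static uint8_t gray_y(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);
}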
@@ -1656,56 +1692,51 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
int x;
- v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
- v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
- v16u8 dst0;
- v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
- v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
- v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
- v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
- v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+ v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr;
+ v16u8 reg0, reg1, reg2, dst;
+ v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
+ v8i16 res0, res1;
+ v8i16 const_66 = (v8i16)__msa_ldi_h(66);
+ v8i16 const_129 = (v8i16)__msa_ldi_h(129);
+ v8i16 const_25 = (v8i16)__msa_ldi_h(25);
+ v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080);
+ v16u8 zero = (v16u8)__msa_ldi_b(0);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0);
- src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16);
- vec0 = src0 & const_0x1F;
- vec1 = src1 & const_0x1F;
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- vec2 = src0 & const_0x1F;
- vec3 = src1 & const_0x1F;
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- vec4 = src0 & const_0x1F;
- vec5 = src1 & const_0x1F;
- reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
- reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
- reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
- reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
- reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
- reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
- reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
- reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
- reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
- reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
- reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
- reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
- reg0 *= const_0x19;
- reg1 *= const_0x19;
- reg2 *= const_0x81;
- reg3 *= const_0x81;
- reg4 *= const_0x42;
- reg5 *= const_0x42;
- reg0 += reg2;
- reg1 += reg3;
- reg0 += reg4;
- reg1 += reg5;
- reg0 += const_0x1080;
- reg1 += const_0x1080;
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
- reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
- dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
- ST_UB(dst0, dst_y);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb1555, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb1555, 16);
+ tmp0 = (v16u8)__msa_pckev_b(src1, src0);
+ tmp1 = (v16u8)__msa_pckod_b(src1, src0);
+ tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
+ tmpg = (v16u8)__msa_srli_b(tmp0, 5);
+ reg0 = (v16u8)__msa_andi_b(tmp1, 0x03);
+ reg0 = (v16u8)__msa_slli_b(reg0, 3);
+ tmpg = (v16u8)__msa_or_v(tmpg, reg0);
+ reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C);
+ tmpr = (v16u8)__msa_srli_b(reg1, 2);
+ reg0 = (v16u8)__msa_slli_b(tmpb, 3);
+ reg1 = (v16u8)__msa_slli_b(tmpg, 3);
+ reg2 = (v16u8)__msa_slli_b(tmpr, 3);
+ tmpb = (v16u8)__msa_srli_b(tmpb, 2);
+ tmpg = (v16u8)__msa_srli_b(tmpg, 2);
+ tmpr = (v16u8)__msa_srli_b(tmpr, 2);
+ tmpb = (v16u8)__msa_or_v(reg0, tmpb);
+ tmpg = (v16u8)__msa_or_v(reg1, tmpg);
+ tmpr = (v16u8)__msa_or_v(reg2, tmpr);
+ tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb);
+ tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb);
+ tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg);
+ tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg);
+ tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr);
+ tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr);
+ res0 = const_1080 + const_25 * tmpb_r;
+ res1 = const_1080 + const_25 * tmpb_l;
+ res0 += const_129 * tmpg_r;
+ res1 += const_129 * tmpg_l;
+ res0 += const_66 * tmpr_r;
+ res1 += const_66 * tmpr_l;
+ dst = (v16u8)__msa_pckod_b(res1, res0);
+ ST_UB(dst, dst_y);
src_argb1555 += 32;
dst_y += 16;
}
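The rewritten ARGB1555ToYRow splits each 16-bit pixel into low/high bytes with pckev_b/pckod_b, isolates the 5-bit fields with byte masks, widens each to 8 bits by replicating the top bits, then applies the studio-range weights (66, 129, 25) plus the 0x1080 bias (a +16 offset and +128 rounding); the final pckod_b takes the high result byte, which is the >>8. A per-pixel sketch:

static uint8_t argb1555_to_y(uint16_t px) {
  int b5 = px & 0x1F, g5 = (px >> 5) & 0x1F, r5 = (px >> 10) & 0x1F;
  int b = (b5 << 3) | (b5 >> 2);  /* 5 -> 8 bits: replicate top bits */
  int g = (g5 << 3) | (g5 >> 2);
  int r = (r5 << 3) | (r5 >> 2);
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}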
@@ -1713,68 +1744,55 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
int x;
- v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
- v4u32 res0, res1, res2, res3;
- v16u8 dst0;
- v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
- v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
- v8i16 const_0x1080 = __msa_fill_h(0x1080);
- v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
- v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
- v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+ v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr;
+ v16u8 reg0, reg1, dst;
+ v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
+ v8i16 res0, res1;
+ v8i16 const_66 = (v8i16)__msa_ldi_h(66);
+ v8i16 const_129 = (v8i16)__msa_ldi_h(129);
+ v8i16 const_25 = (v8i16)__msa_ldi_h(25);
+ v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080);
+ v16u8 zero = (v16u8)__msa_ldi_b(0);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0);
- src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16);
- vec0 = src0 & const_0x1F;
- vec1 = src0 & const_0x7E0;
- vec2 = src0 & const_0xF800;
- vec3 = src1 & const_0x1F;
- vec4 = src1 & const_0x7E0;
- vec5 = src1 & const_0xF800;
- reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
- reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
- reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
- reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
- reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
- reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
- reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
- reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
- reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
- reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
- reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
- reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
- vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
- vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
- vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
- vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
- vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
- vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
- vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
- vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
- res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
- res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
- res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
- res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
- res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
- res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
- res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
- res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
- res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
- res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
- res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
- res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
- vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
- vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
- dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
- ST_UB(dst0, dst_y);
+ src0 = (v16u8)__msa_ld_b((void*)src_rgb565, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_rgb565, 16);
+ tmp0 = (v16u8)__msa_pckev_b(src1, src0);
+ tmp1 = (v16u8)__msa_pckod_b(src1, src0);
+ tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
+ tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8);
+ reg1 = (v16u8)__msa_andi_b(tmp1, 0x07);
+ reg0 = (v16u8)__msa_srli_b(tmp0, 5);
+ reg1 = (v16u8)__msa_slli_b(reg1, 3);
+ tmpg = (v16u8)__msa_or_v(reg1, reg0);
+ reg0 = (v16u8)__msa_slli_b(tmpb, 3);
+ reg1 = (v16u8)__msa_srli_b(tmpb, 2);
+ tmpb = (v16u8)__msa_or_v(reg1, reg0);
+ reg0 = (v16u8)__msa_slli_b(tmpg, 2);
+ reg1 = (v16u8)__msa_srli_b(tmpg, 4);
+ tmpg = (v16u8)__msa_or_v(reg1, reg0);
+ reg0 = (v16u8)__msa_srli_b(tmpr, 5);
+ tmpr = (v16u8)__msa_or_v(tmpr, reg0);
+ tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb);
+ tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb);
+ tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg);
+ tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg);
+ tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr);
+ tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr);
+ res0 = const_1080 + const_25 * tmpb_r;
+ res1 = const_1080 + const_25 * tmpb_l;
+ res0 += const_129 * tmpg_r;
+ res1 += const_129 * tmpg_l;
+ res0 += const_66 * tmpr_r;
+ res1 += const_66 * tmpr_l;
+ dst = (v16u8)__msa_pckod_b(res1, res0);
+ ST_UB(dst, dst_y);
src_rgb565 += 32;
dst_y += 16;
}
}
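RGB565ToYRow follows the same pattern with 5-6-5 fields: green widens as (g << 2) | (g >> 4), and since tmp1 & 0xF8 already holds red pre-shifted into the top bits, red only needs the | (r >> 5) replication. Scalar equivalent:

static uint8_t rgb565_to_y(uint16_t px) {
  int b5 = px & 0x1F, g6 = (px >> 5) & 0x3F, r5 = px >> 11;
  int b = (b5 << 3) | (b5 >> 2);
  int g = (g6 << 2) | (g6 >> 4);  /* 6 -> 8 bits */
  int r = (r5 << 3) | (r5 >> 2);
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}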
-void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
v8u16 vec0, vec1, vec2, vec3;
@@ -1789,9 +1807,9 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v16i8 zero = {0};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
@@ -1810,12 +1828,12 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_y);
- src_argb0 += 48;
+ src_argb += 48;
dst_y += 16;
}
}
-void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
v8u16 vec0, vec1, vec2, vec3;
@@ -1830,9 +1848,9 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v16i8 zero = {0};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
@@ -1851,7 +1869,7 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_y);
- src_argb0 += 48;
+ src_argb += 48;
dst_y += 16;
}
}
@@ -1865,69 +1883,61 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
const uint16_t* s = (const uint16_t*)src_argb1555;
const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555);
int64_t res0, res1;
- v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
- v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
- v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+ v16u8 src0, src1, src2, src3, dst;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 reg0, reg1, reg2, reg3;
+ v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
+ v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
+ v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
+ v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
+ v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
+ v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 16) {
src0 = (v8u16)__msa_ld_b((void*)s, 0);
src1 = (v8u16)__msa_ld_b((void*)s, 16);
src2 = (v8u16)__msa_ld_b((void*)t, 0);
src3 = (v8u16)__msa_ld_b((void*)t, 16);
- vec0 = src0 & const_0x1F;
- vec1 = src1 & const_0x1F;
- vec0 += src2 & const_0x1F;
- vec1 += src3 & const_0x1F;
- vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
- src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
- vec2 = src0 & const_0x1F;
- vec3 = src1 & const_0x1F;
- vec2 += src2 & const_0x1F;
- vec3 += src3 & const_0x1F;
- vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
- src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
- vec4 = src0 & const_0x1F;
- vec5 = src1 & const_0x1F;
- vec4 += src2 & const_0x1F;
- vec5 += src3 & const_0x1F;
- vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
- vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
- vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
- vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
- vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
- vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
- vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
- vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
- vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
- vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
- reg0 = vec6 * const_0x70;
- reg1 = vec0 * const_0x4A;
- reg2 = vec2 * const_0x70;
- reg3 = vec0 * const_0x5E;
- reg0 += const_0x8080;
- reg1 += vec2 * const_0x26;
- reg2 += const_0x8080;
- reg3 += vec6 * const_0x12;
- reg0 -= reg1;
- reg2 -= reg3;
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
- reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
- dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
- res0 = __msa_copy_u_d((v2i64)dst0, 0);
- res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ tmp0 = (v16u8)__msa_pckev_b(src1, src0);
+ tmp1 = (v16u8)__msa_pckod_b(src1, src0);
+ tmp2 = (v16u8)__msa_pckev_b(src3, src2);
+ tmp3 = (v16u8)__msa_pckod_b(src3, src2);
+ tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
+ nexb = (v16u8)__msa_andi_b(tmp2, 0x1F);
+ tmpg = (v16u8)__msa_srli_b(tmp0, 5);
+ nexg = (v16u8)__msa_srli_b(tmp2, 5);
+ reg0 = (v16u8)__msa_andi_b(tmp1, 0x03);
+ reg2 = (v16u8)__msa_andi_b(tmp3, 0x03);
+ reg0 = (v16u8)__msa_slli_b(reg0, 3);
+ reg2 = (v16u8)__msa_slli_b(reg2, 3);
+ tmpg = (v16u8)__msa_or_v(tmpg, reg0);
+ nexg = (v16u8)__msa_or_v(nexg, reg2);
+ reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C);
+ reg3 = (v16u8)__msa_andi_b(tmp3, 0x7C);
+ tmpr = (v16u8)__msa_srli_b(reg1, 2);
+ nexr = (v16u8)__msa_srli_b(reg3, 2);
+ reg0 = (v16u8)__msa_slli_b(tmpb, 3);
+ reg1 = (v16u8)__msa_slli_b(tmpg, 3);
+ reg2 = (v16u8)__msa_slli_b(tmpr, 3);
+ tmpb = (v16u8)__msa_srli_b(tmpb, 2);
+ tmpg = (v16u8)__msa_srli_b(tmpg, 2);
+ tmpr = (v16u8)__msa_srli_b(tmpr, 2);
+ tmpb = (v16u8)__msa_or_v(reg0, tmpb);
+ tmpg = (v16u8)__msa_or_v(reg1, tmpg);
+ tmpr = (v16u8)__msa_or_v(reg2, tmpr);
+ reg0 = (v16u8)__msa_slli_b(nexb, 3);
+ reg1 = (v16u8)__msa_slli_b(nexg, 3);
+ reg2 = (v16u8)__msa_slli_b(nexr, 3);
+ nexb = (v16u8)__msa_srli_b(nexb, 2);
+ nexg = (v16u8)__msa_srli_b(nexg, 2);
+ nexr = (v16u8)__msa_srli_b(nexr, 2);
+ nexb = (v16u8)__msa_or_v(reg0, nexb);
+ nexg = (v16u8)__msa_or_v(reg1, nexg);
+ nexr = (v16u8)__msa_or_v(reg2, nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst);
+ res0 = __msa_copy_u_d((v2i64)dst, 0);
+ res1 = __msa_copy_u_d((v2i64)dst, 1);
SD(res0, dst_u);
SD(res1, dst_v);
s += 16;
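Once both rows' channels are widened to 8 bits, the vertical sums, dot products and 0x8080 bias all move into the shared RGBTOUV macro (defined earlier in row_msa.cc). Note the const_112..const_18 locals are named for the nominal weights but hold the halved values, matching the rounded half-sum inputs. Assuming RGBTOUV forms the same rounded half-sums as the ARGBToUV path above, per 2x2 block it computes, in scalar terms:

/* b, g, r: rounded half-sums of a 2x2 block (see block_to_u above). */
static void block_to_uv(int b, int g, int r, uint8_t* u, uint8_t* v) {
  *u = (uint8_t)((0x38 * b - 0x25 * g - 0x13 * r + 0x8080) >> 8);
  *v = (uint8_t)((0x38 * r - 0x2F * g - 0x09 * b + 0x8080) >> 8);
}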
@@ -1946,68 +1956,57 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
const uint16_t* s = (const uint16_t*)src_rgb565;
const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565);
int64_t res0, res1;
- v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
- v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
- v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080);
- v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
- v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
+ v16u8 src0, src1, src2, src3, dst;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 reg0, reg1, reg2, reg3;
+ v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
+ v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
+ v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
+ v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
+ v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
+ v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((void*)s, 0);
- src1 = (v8u16)__msa_ld_b((void*)s, 16);
- src2 = (v8u16)__msa_ld_b((void*)t, 0);
- src3 = (v8u16)__msa_ld_b((void*)t, 16);
- vec0 = src0 & const_0x1F;
- vec1 = src1 & const_0x1F;
- vec0 += src2 & const_0x1F;
- vec1 += src3 & const_0x1F;
- vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
- src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
- vec2 = src0 & const_0x3F;
- vec3 = src1 & const_0x3F;
- vec2 += src2 & const_0x3F;
- vec3 += src3 & const_0x3F;
- vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
- src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
- src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
- vec4 = src0 & const_0x1F;
- vec5 = src1 & const_0x1F;
- vec4 += src2 & const_0x1F;
- vec5 += src3 & const_0x1F;
- vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
- vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
- vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
- vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
- vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
- vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
- vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
- vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
- reg0 = vec3 * const_0x70;
- reg1 = vec1 * const_0x4A;
- reg2 = vec4 * const_0x70;
- reg3 = vec1 * const_0x5E;
- reg0 += const_32896;
- reg1 += vec4 * const_0x26;
- reg2 += const_32896;
- reg3 += vec3 * const_0x12;
- reg0 -= reg1;
- reg2 -= reg3;
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
- reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
- dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
- res0 = __msa_copy_u_d((v2i64)dst0, 0);
- res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ src0 = (v16u8)__msa_ld_b((void*)s, 0);
+ src1 = (v16u8)__msa_ld_b((void*)s, 16);
+ src2 = (v16u8)__msa_ld_b((void*)t, 0);
+ src3 = (v16u8)__msa_ld_b((void*)t, 16);
+ tmp0 = (v16u8)__msa_pckev_b(src1, src0);
+ tmp1 = (v16u8)__msa_pckod_b(src1, src0);
+ tmp2 = (v16u8)__msa_pckev_b(src3, src2);
+ tmp3 = (v16u8)__msa_pckod_b(src3, src2);
+ tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
+ tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8);
+ nexb = (v16u8)__msa_andi_b(tmp2, 0x1F);
+ nexr = (v16u8)__msa_andi_b(tmp3, 0xF8);
+ reg1 = (v16u8)__msa_andi_b(tmp1, 0x07);
+ reg3 = (v16u8)__msa_andi_b(tmp3, 0x07);
+ reg0 = (v16u8)__msa_srli_b(tmp0, 5);
+ reg1 = (v16u8)__msa_slli_b(reg1, 3);
+ reg2 = (v16u8)__msa_srli_b(tmp2, 5);
+ reg3 = (v16u8)__msa_slli_b(reg3, 3);
+ tmpg = (v16u8)__msa_or_v(reg1, reg0);
+ nexg = (v16u8)__msa_or_v(reg2, reg3);
+ reg0 = (v16u8)__msa_slli_b(tmpb, 3);
+ reg1 = (v16u8)__msa_srli_b(tmpb, 2);
+ reg2 = (v16u8)__msa_slli_b(nexb, 3);
+ reg3 = (v16u8)__msa_srli_b(nexb, 2);
+ tmpb = (v16u8)__msa_or_v(reg1, reg0);
+ nexb = (v16u8)__msa_or_v(reg2, reg3);
+ reg0 = (v16u8)__msa_slli_b(tmpg, 2);
+ reg1 = (v16u8)__msa_srli_b(tmpg, 4);
+ reg2 = (v16u8)__msa_slli_b(nexg, 2);
+ reg3 = (v16u8)__msa_srli_b(nexg, 4);
+ tmpg = (v16u8)__msa_or_v(reg1, reg0);
+ nexg = (v16u8)__msa_or_v(reg2, reg3);
+ reg0 = (v16u8)__msa_srli_b(tmpr, 5);
+ reg2 = (v16u8)__msa_srli_b(nexr, 5);
+ tmpr = (v16u8)__msa_or_v(tmpr, reg0);
+ nexr = (v16u8)__msa_or_v(nexr, reg2);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst);
+ res0 = __msa_copy_u_d((v2i64)dst, 0);
+ res1 = __msa_copy_u_d((v2i64)dst, 1);
SD(res0, dst_u);
SD(res1, dst_v);
s += 16;
@@ -2017,26 +2016,27 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
}
}
-void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
int64_t res0, res1;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 reg0, reg1, reg2, reg3;
v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2F);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
v16i8 zero = {0};
@@ -2085,10 +2085,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
- reg0 = __msa_srai_h((v8i16)reg0, 2);
- reg1 = __msa_srai_h((v8i16)reg1, 2);
- reg2 = __msa_srai_h((v8i16)reg2, 2);
- reg3 = __msa_srai_h((v8i16)reg3, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg0 = __msa_srai_h((v8i16)reg0, 1);
+ reg1 = __msa_srai_h((v8i16)reg1, 1);
+ reg2 = __msa_srai_h((v8i16)reg2, 1);
+ reg3 = __msa_srai_h((v8i16)reg3, 1);
vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
@@ -2122,26 +2126,27 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
}
}
-void RAWToUVRow_MSA(const uint8_t* src_rgb0,
+void RAWToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
int64_t res0, res1;
v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 reg0, reg1, reg2, reg3;
v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2F);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
v16i8 zero = {0};
@@ -2190,10 +2195,14 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0,
reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
- reg0 = __msa_srai_h(reg0, 2);
- reg1 = __msa_srai_h(reg1, 2);
- reg2 = __msa_srai_h(reg2, 2);
- reg3 = __msa_srai_h(reg3, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg0 = __msa_srai_h(reg0, 1);
+ reg1 = __msa_srai_h(reg1, 1);
+ reg2 = __msa_srai_h(reg2, 1);
+ reg3 = __msa_srai_h(reg3, 1);
vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
@@ -2236,13 +2245,13 @@ void NV12ToARGBRow_MSA(const uint8_t* src_y,
uint64_t val0, val1;
v16u8 src0, src1, res0, res1, dst0, dst1;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 zero = {0};
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2251,8 +2260,7 @@ void NV12ToARGBRow_MSA(const uint8_t* src_y,
val1 = LD(src_uv);
src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
@@ -2273,12 +2281,12 @@ void NV12ToRGB565Row_MSA(const uint8_t* src_y,
uint64_t val0, val1;
v16u8 src0, src1, dst0;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
v16u8 zero = {0};
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2287,8 +2295,7 @@ void NV12ToRGB565Row_MSA(const uint8_t* src_y,
val1 = LD(src_uv);
src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
vec0 = vec0 >> 3;
vec1 = (vec1 >> 2) << 5;
vec2 = (vec2 >> 3) << 11;
@@ -2309,14 +2316,14 @@ void NV21ToARGBRow_MSA(const uint8_t* src_y,
uint64_t val0, val1;
v16u8 src0, src1, res0, res1, dst0, dst1;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v16u8 zero = {0};
v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2326,8 +2333,7 @@ void NV21ToARGBRow_MSA(const uint8_t* src_y,
src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
@@ -2416,27 +2422,27 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx,
}
}
-void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
- v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
- v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
- v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
+ v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
+ v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D);
+ v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
- ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
+ ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8,
dst0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
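ARGBToYJRow switches to the full-range weights with a plain +0x80 rounding bias and >>8. The packed constants encode two byte weights per halfword for __msa_dotp_u_h: the low byte multiplies the first (lower-addressed) byte of each pair and the high byte the second, which is how the BGRA/ABGR/RGBA variants below express the same 66/129/25 kernel for their byte orders (e.g. 0x4200 gives R*0x42 with A*0, and 0x1981 gives G*0x81 + B*0x19). A sketch of one such dot-product lane (name illustrative):

/* One __msa_dotp_u_h lane: a (lo, hi) byte pair dotted with a packed weight. */
static uint16_t dotp_lane(uint8_t lo, uint8_t hi, uint16_t w) {
  return (uint16_t)(lo * (w & 0xFF) + hi * (w >> 8));
}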
-void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
@@ -2444,19 +2450,19 @@ void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
@@ -2464,19 +2470,19 @@ void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
@@ -2484,81 +2490,143 @@ void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 vec0, vec1, vec2, vec3;
- v16u8 dst0, dst1;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
- v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
- v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
- v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
+ v8u16 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 dst0, dst1, dst2, dst3;
+ v16u8 zero = {0};
+ v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15};
+ v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15};
+ v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
+ v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f);
+ v4u32 const_0x00008080 = (v4u32)__msa_fill_w(0x00008080);
+ v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a);
+ v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a);
+ v4i32 shift = __msa_fill_w(0x00000008);
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((void*)s, 0);
- src1 = (v16u8)__msa_ld_b((void*)s, 16);
- src2 = (v16u8)__msa_ld_b((void*)s, 32);
- src3 = (v16u8)__msa_ld_b((void*)s, 48);
- src4 = (v16u8)__msa_ld_b((void*)t, 0);
- src5 = (v16u8)__msa_ld_b((void*)t, 16);
- src6 = (v16u8)__msa_ld_b((void*)t, 32);
- src7 = (v16u8)__msa_ld_b((void*)t, 48);
- src0 = __msa_aver_u_b(src0, src4);
- src1 = __msa_aver_u_b(src1, src5);
- src2 = __msa_aver_u_b(src2, src6);
- src3 = __msa_aver_u_b(src3, src7);
- src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
- src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
- src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
- src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
- vec0 = __msa_aver_u_b(src4, src6);
- vec1 = __msa_aver_u_b(src5, src7);
- src0 = (v16u8)__msa_ld_b((void*)s, 64);
- src1 = (v16u8)__msa_ld_b((void*)s, 80);
- src2 = (v16u8)__msa_ld_b((void*)s, 96);
- src3 = (v16u8)__msa_ld_b((void*)s, 112);
- src4 = (v16u8)__msa_ld_b((void*)t, 64);
- src5 = (v16u8)__msa_ld_b((void*)t, 80);
- src6 = (v16u8)__msa_ld_b((void*)t, 96);
- src7 = (v16u8)__msa_ld_b((void*)t, 112);
- src0 = __msa_aver_u_b(src0, src4);
- src1 = __msa_aver_u_b(src1, src5);
- src2 = __msa_aver_u_b(src2, src6);
- src3 = __msa_aver_u_b(src3, src7);
- src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
- src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
- src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
- src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
- vec2 = __msa_aver_u_b(src4, src6);
- vec3 = __msa_aver_u_b(src5, src7);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
- const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_v);
- ST_UB(dst1, dst_u);
+ src1 = __msa_ld_b((void*)s, 0);
+ src3 = __msa_ld_b((void*)s, 16);
+ src5 = __msa_ld_b((void*)t, 0);
+ src7 = __msa_ld_b((void*)t, 16);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec0 = __msa_aver_u_h(src4, src5);
+ vec1 = __msa_aver_u_h(src6, src7);
+
+ src1 = __msa_ld_b((void*)s, 32);
+ src3 = __msa_ld_b((void*)s, 48);
+ src5 = __msa_ld_b((void*)t, 32);
+ src7 = __msa_ld_b((void*)t, 48);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec2 = __msa_aver_u_h(src4, src5);
+ vec3 = __msa_aver_u_h(src6, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
+ const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
+ shuffler2, shuffler3, shift, dst0, dst1);
+
+ src1 = __msa_ld_b((void*)s, 64);
+ src3 = __msa_ld_b((void*)s, 80);
+ src5 = __msa_ld_b((void*)t, 64);
+ src7 = __msa_ld_b((void*)t, 80);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec0 = __msa_aver_u_h(src4, src5);
+ vec1 = __msa_aver_u_h(src6, src7);
+
+ src1 = __msa_ld_b((void*)s, 96);
+ src3 = __msa_ld_b((void*)s, 112);
+ src5 = __msa_ld_b((void*)t, 96);
+ src7 = __msa_ld_b((void*)t, 112);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec2 = __msa_aver_u_h(src4, src5);
+ vec3 = __msa_aver_u_h(src6, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
+ const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
+ shuffler2, shuffler3, shift, dst2, dst3);
+
+ dst0 = (v8u16)__msa_pckev_b(dst2, dst0);
+ dst1 = (v8u16)__msa_pckev_b(dst3, dst1);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
s += 128;
t += 128;
dst_v += 16;
@@ -2566,103 +2634,108 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
}
}
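ARGBToUVJRow's rewrite widens everything to 16 bits before averaging: ilvr_b/ilvl_b against zero unpack the bytes, the two rows are summed as halfwords, and aver_u_h produces a rounded half-sum, so the full-range (J) path keeps exact 2x2 sums instead of the old two-stage byte averaging and its double rounding. Each 16-pixel half yields an 8-byte result, and the trailing pckev_b pair merges the halves into full 16-byte U and V stores. The averaging step in scalar form:

/* Rounded 2x2 half-sum over 16-bit intermediates (no 8-bit overflow). */
static uint16_t avg2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  uint16_t left = (uint16_t)(a + c);   /* row0 + row1, left column */
  uint16_t right = (uint16_t)(b + d);  /* row0 + row1, right column */
  return (uint16_t)((left + right + 1) >> 1); /* what __msa_aver_u_h does */
}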
-void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
+void BGRAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
- v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
- v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused};
+ v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15};
+ v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused};
+ v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, vec0, vec1, vec2, vec3);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
- const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_v);
- ST_UB(dst1, dst_u);
- s += 128;
- t += 128;
- dst_v += 16;
- dst_u += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
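BGRAToUVRow, ABGRToUVRow and RGBAToUVRow collapse onto the shared READ_ARGB/ARGBTOUV_H pair. Each iteration now covers 16 pixels (down from 32) and produces 8 chroma bytes per plane, extracted with __msa_copy_u_d; only the shuffler vectors differ between the three, selecting which halfword lane carries each channel for that byte order (the unused = 0xf entries are don't-care lanes). The raw *((uint64_t*)dst_v) store assumes dst_v may be addressed as a uint64_t; a portable sketch of the same store:

#include <stdint.h>
#include <string.h>
/* Alignment- and aliasing-safe equivalent of the cast store above. */
static void store_8bytes(uint8_t* dst, uint64_t v) {
  memcpy(dst, &v, sizeof(v));  /* compiles to a single store where legal */
}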
-void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
+void ABGRToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 src0, src1, src2, src3;
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
v16u8 dst0, dst1;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
- v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
- v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused};
+ v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused};
+ v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, src0, src1, src2, src3);
- ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
- const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_u);
- ST_UB(dst1, dst_v);
- s += 128;
- t += 128;
- dst_u += 16;
- dst_v += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
-void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
+void RGBAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
- v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
- v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused};
+ v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13};
+ v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused};
+ v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, vec0, vec1, vec2, vec3);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
- const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_u);
- ST_UB(dst1, dst_v);
- s += 128;
- t += 128;
- dst_u += 16;
- dst_v += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
@@ -2674,54 +2747,57 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
int width) {
int x;
v16u8 src0, src1, src2, dst0, dst1;
- v8u16 vec0, vec1, vec2;
+ v8i16 vec0, vec1, vec2;
v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v8i16 zero = {0};
+ v4i32 const_0x80 = __msa_fill_w(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
for (x = 0; x < width; x += 8) {
READI444(src_y, src_u, src_v, src0, src1, src2);
- vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
reg0 *= vec_yg;
reg1 *= vec_yg;
reg0 = __msa_srai_w(reg0, 16);
reg1 = __msa_srai_w(reg1, 16);
- reg4 = reg0 + vec_br;
- reg5 = reg1 + vec_br;
- reg2 = reg0 + vec_bg;
- reg3 = reg1 + vec_bg;
- reg0 += vec_bb;
- reg1 += vec_bb;
+ reg0 += vec_yb;
+ reg1 += vec_yb;
     vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
     vec1 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
- reg0 -= reg6 * vec_ub;
- reg1 -= reg7 * vec_ub;
- reg2 -= reg6 * vec_ug;
- reg3 -= reg7 * vec_ug;
- reg4 -= reg8 * vec_vr;
- reg5 -= reg9 * vec_vr;
- reg2 -= reg8 * vec_vg;
- reg3 -= reg9 * vec_vg;
- reg0 = __msa_srai_w(reg0, 6);
- reg1 = __msa_srai_w(reg1, 6);
- reg2 = __msa_srai_w(reg2, 6);
- reg3 = __msa_srai_w(reg3, 6);
- reg4 = __msa_srai_w(reg4, 6);
- reg5 = __msa_srai_w(reg5, 6);
+ reg6 -= const_0x80;
+ reg7 -= const_0x80;
+ reg8 -= const_0x80;
+ reg9 -= const_0x80;
+ tmp0 = reg0 + reg6 * vec_ub;
+ tmp1 = reg1 + reg7 * vec_ub;
+ tmp2 = reg0 + reg8 * vec_vr;
+ tmp3 = reg1 + reg9 * vec_vr;
+ tmp4 = reg6 * vec_ug;
+ tmp5 = reg7 * vec_ug;
+ tmp4 += reg8 * vec_vg;
+ tmp5 += reg9 * vec_vg;
+ tmp4 = reg0 - tmp4;
+ tmp5 = reg1 - tmp5;
+ reg0 = __msa_srai_w(tmp0, 6);
+ reg1 = __msa_srai_w(tmp1, 6);
+ reg2 = __msa_srai_w(tmp2, 6);
+ reg3 = __msa_srai_w(tmp3, 6);
+ reg4 = __msa_srai_w(tmp4, 6);
+ reg5 = __msa_srai_w(tmp5, 6);
CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
     vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
-    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
-    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+    vec1 = (v8i16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+    vec2 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
     vec0 = (v8i16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
     vec1 = (v8i16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
@@ -2734,13 +2810,24 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
}
}
-void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+// TODO: Respect YuvConstants.
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
+#if defined(__aarch64__) || defined(__arm__)
+ int ygb = yuvconstants->kUVBiasBGR[3];
+ int yg = yuvconstants->kYToRgb[1];
+#else
+ int ygb = yuvconstants->kYBiasToRgb[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
v8i16 vec0, vec1;
v4i32 reg0, reg1, reg2, reg3;
- v4i32 vec_yg = __msa_fill_w(0x4A35);
- v8i16 vec_ygb = __msa_fill_h(0xFB78);
+ v4i32 vec_yg = __msa_fill_w(yg);
+ v8i16 vec_ygb = __msa_fill_h(ygb);
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v8i16 max = __msa_ldi_h(0xFF);
v8i16 zero = {0};
@@ -2814,12 +2901,12 @@ void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2827,8 +2914,7 @@ void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0);
src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
- YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_yuy2 += 16;
dst_argb += 32;
@@ -2842,12 +2928,12 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2855,8 +2941,7 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0);
src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
- YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_uyvy += 16;
dst_argb += 32;
@@ -3001,12 +3086,12 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
}
}
-void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+void ARGBBlendRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int x;
- v16u8 src0, src1, src2, src3, dst0, dst1;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
v8u16 const_256 = (v8u16)__msa_ldi_h(256);
@@ -3015,8 +3100,8 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
v16i8 zero = {0};
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
@@ -3051,16 +3136,16 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
- vec0 += vec8;
- vec1 += vec9;
- vec2 += vec10;
- vec3 += vec11;
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8);
+ dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
+ dst0 = (v16u8)__msa_adds_u_b(dst0, dst2);
+ dst1 = (v16u8)__msa_adds_u_b(dst1, dst3);
dst0 = __msa_bmnz_v(dst0, const_255, mask);
dst1 = __msa_bmnz_v(dst1, const_255, mask);
ST_UB2(dst0, dst1, dst_argb, 16);
- src_argb0 += 32;
+ src_argb += 32;
src_argb1 += 32;
dst_argb += 32;
}
@@ -3082,7 +3167,7 @@ void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
v16i8 zero = {0};
- for (x = 0; x < width; x += 8) {
+ for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32);
@@ -3315,10 +3400,10 @@ void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) {
}
}
-void MirrorUVRow_MSA(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
diff --git a/source/row_neon.cc b/source/row_neon.cc
new file mode 100644
index 00000000..31142a90
--- /dev/null
+++ b/source/row_neon.cc
@@ -0,0 +1,3999 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// d8-d15, r4-r11 and r14(lr) need to be preserved if used. r13(sp) and
+// r15(pc) are reserved.
+
+// q0: Y uint16x8_t
+// d2: U uint8x8_t
+// d3: V uint8x8_t
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vld1.32 {d2[0]}, [%[src_u]]! \n" \
+ "vld1.32 {d2[1]}, [%[src_v]]! \n" \
+ "vmov.u8 d1, d0 \n" \
+ "vmovl.u8 q1, d2 \n" \
+ "vzip.u8 d0, d1 \n" \
+ "vsli.u16 q1, q1, #8 \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vld1.8 {d2}, [%[src_u]]! \n" \
+ "vmovl.u8 q0, d0 \n" \
+ "vld1.8 {d3}, [%[src_v]]! \n" \
+ "vsli.u16 q0, q0, #8 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vmov.u8 q1, #128 \n" \
+ "vmovl.u8 q0, d0 \n" \
+ "vsli.u16 q0, q0, #8 \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vld1.8 {d2}, [%[src_uv]]! \n" \
+ "vmov.u8 d1, d0 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vzip.u8 d0, d1 \n" \
+ "vsli.u16 d2, d2, #8 \n" /* Duplicate low byte (U) */ \
+ "vsri.u16 d3, d3, #8 \n" /* Duplicate high byte (V) */
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vld1.8 {d2}, [%[src_vu]]! \n" \
+ "vmov.u8 d1, d0 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vzip.u8 d0, d1 \n" \
+ "vsri.u16 d2, d2, #8 \n" /* Duplicate high byte (U) */ \
+ "vsli.u16 d3, d3, #8 \n" /* Duplicate low byte (V) */
+
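+// A scalar sketch of the vsli/vsri byte-duplication trick used above: with a
+// U,V byte pair sitting in one 16-bit lane, vsli.u16 #8 spreads the low byte
+// across the whole lane and vsri.u16 #8 spreads the high byte. Helper names
+// are illustrative, not part of the library.
+static inline uint16_t DupLowByte(uint16_t x) {  // vsli.u16 d, d, #8
+  return (uint16_t)(((x & 0xffu) << 8) | (x & 0xffu));
+}
+static inline uint16_t DupHighByte(uint16_t x) {  // vsri.u16 d, d, #8
+  return (uint16_t)((x & 0xff00u) | (x >> 8));
+}
+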
+// Read 8 YUY2
+#define READYUY2 \
+ "vld2.8 {d0, d2}, [%[src_yuy2]]! \n" \
+ "vmovl.u8 q0, d0 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vsli.u16 q0, q0, #8 \n" \
+ "vsli.u16 d2, d2, #8 \n" \
+ "vsri.u16 d3, d3, #8 \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ "vld2.8 {d2, d3}, [%[src_uyvy]]! \n" \
+ "vmovl.u8 q0, d3 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vsli.u16 q0, q0, #8 \n" \
+ "vsli.u16 d2, d2, #8 \n" \
+ "vsri.u16 d3, d3, #8 \n"
+
+// TODO: Use single register for kUVCoeff and multiply by lane
+#define YUVTORGB_SETUP \
+ "vld1.16 {d31}, [%[kRGBCoeffBias]] \n" \
+ "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \
+ "vdup.u16 q10, d31[1] \n" \
+ "vdup.u16 q11, d31[2] \n" \
+ "vdup.u16 q12, d31[3] \n" \
+ "vdup.u16 d31, d31[0] \n"
+
+// q0: B uint16x8_t
+// q1: G uint16x8_t
+// q2: R uint16x8_t
+
+// Convert from YUV to 2.14 fixed point RGB
+#define YUVTORGB \
+ "vmull.u16 q2, d1, d31 \n" \
+ "vmull.u8 q8, d3, d29 \n" /* DGV */ \
+ "vmull.u16 q0, d0, d31 \n" \
+ "vmlal.u8 q8, d2, d28 \n" /* DG */ \
+ "vqshrn.u32 d0, q0, #16 \n" \
+ "vqshrn.u32 d1, q2, #16 \n" /* Y */ \
+ "vmull.u8 q9, d2, d26 \n" /* DB */ \
+ "vmull.u8 q2, d3, d27 \n" /* DR */ \
+ "vadd.u16 q4, q0, q11 \n" /* G */ \
+ "vadd.u16 q2, q0, q2 \n" /* R */ \
+ "vadd.u16 q0, q0, q9 \n" /* B */ \
+ "vqsub.u16 q1, q4, q8 \n" /* G */ \
+ "vqsub.u16 q0, q0, q10 \n" /* B */ \
+ "vqsub.u16 q2, q2, q12 \n" /* R */
+
+// Convert from 2.14 fixed point RGB to 8 bit RGB
+#define RGBTORGB8 \
+ "vqshrn.u16 d4, q2, #6 \n" /* R */ \
+ "vqshrn.u16 d2, q1, #6 \n" /* G */ \
+ "vqshrn.u16 d0, q0, #6 \n" /* B */
+
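+// A minimal scalar model of READYUV*/YUVTORGB/RGBTORGB8 for one pixel. The
+// parameter names (yg, ub, ug, vg, vr and the bb/bg/br biases) are
+// illustrative stand-ins for the values packed into kRGBCoeffBias and
+// kUVCoeff, and arithmetic is widened to 32 bits for clarity: a sketch of
+// the 2.14 fixed-point math, not the library API.
+static inline uint8_t Fixed2_14ToU8(uint32_t v) {  // vqshrn.u16 #6
+  v >>= 6;
+  return (uint8_t)(v > 255u ? 255u : v);
+}
+static inline void YuvPixelSketch(uint8_t y8, uint8_t u, uint8_t v,
+                                  uint16_t yg, uint8_t ub, uint8_t ug,
+                                  uint8_t vg, uint8_t vr, uint16_t bb,
+                                  uint16_t bg, uint16_t br, uint8_t* b,
+                                  uint8_t* g, uint8_t* r) {
+  uint32_t y16 = y8 * 0x0101u;          // vsli.u16: replicate Y into 8.8
+  uint32_t y = (y16 * yg) >> 16;        // vmull.u16 + vqshrn.u32 #16
+  uint32_t dg = u * ug + v * vg;        // vmull.u8 + vmlal.u8
+  uint32_t tb = y + u * ub;             // vadd.u16
+  uint32_t tg = y + bg;
+  uint32_t tr = y + v * vr;
+  *b = Fixed2_14ToU8(tb > bb ? tb - bb : 0);  // vqsub.u16 floors at zero
+  *g = Fixed2_14ToU8(tg > dg ? tg - dg : 0);
+  *r = Fixed2_14ToU8(tr > br ? tr - br : 0);
+}
+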
+#define YUVTORGB_REGS \
+ "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31"
+
+#define STORERGBA \
+ "vmov.u8 d1, d0 \n" \
+ "vmov.u8 d3, d4 \n" \
+ "vmov.u8 d0, d6 \n" \
+ "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n"
+
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUV444 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void I444ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV444 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+        [dst_rgb24] "+r"(dst_rgb24),  // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV444 YUVTORGB
+ RGBTORGB8
+ "vld1.8 {d6}, [%[src_a]]! \n"
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "vld1.8 {d6}, [%[src_a]]! \n"
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgba] "+r"(dst_rgba), // %[dst_rgba]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+#define ARGBTORGB565 \
+ "vshll.u8 q2, d4, #8 \n" /* R */ \
+ "vshll.u8 q1, d2, #8 \n" /* G */ \
+ "vshll.u8 q0, d0, #8 \n" /* B */ \
+ "vsri.16 q2, q1, #5 \n" /* RG */ \
+ "vsri.16 q2, q0, #11 \n" /* RGB */
+
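+// Scalar equivalent of ARGBTORGB565: truncate each channel to its field width
+// and pack as RRRRRGGGGGGBBBBB. The NEON version reaches the same result by
+// widening each channel to the top of a 16-bit lane, then using
+// shift-right-insert (vsri) to slide G under R and B under G.
+static inline uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
+  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
+}
+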
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565
+ "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+#define ARGBTOARGB1555 \
+ "vshll.u8 q3, d6, #8 \n" /* A */ \
+ "vshll.u8 q2, d4, #8 \n" /* R */ \
+ "vshll.u8 q1, d2, #8 \n" /* G */ \
+ "vshll.u8 q0, d0, #8 \n" /* B */ \
+ "vsri.16 q3, q2, #1 \n" /* AR */ \
+ "vsri.16 q3, q1, #6 \n" /* ARG */ \
+ "vsri.16 q3, q0, #11 \n" /* ARGB */
+
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vmov.u8 d6, #0xff \n" ARGBTOARGB1555
+ "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555.
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "q3");
+}
+
+#define ARGBTOARGB4444 \
+ "vshr.u8 d0, d0, #4 \n" /* B */ \
+ "vbic.32 d2, d2, d7 \n" /* G */ \
+ "vshr.u8 d4, d4, #4 \n" /* R */ \
+ "vbic.32 d6, d6, d7 \n" /* A */ \
+ "vorr d0, d0, d2 \n" /* BG */ \
+ "vorr d1, d4, d6 \n" /* RA */ \
+ "vzip.u8 d0, d1 \n" /* BGRA */
+
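+// Scalar equivalent of ARGBTOARGB4444: keep the high nibble of each channel.
+// The NEON version builds the two output bytes directly, (G & 0xf0) | (B >> 4)
+// and (A & 0xf0) | (R >> 4), then interleaves them with vzip.
+static inline uint16_t PackARGB4444(uint8_t a, uint8_t r, uint8_t g,
+                                    uint8_t b) {
+  return (uint16_t)(((a >> 4) << 12) | ((r >> 4) << 8) | ((g >> 4) << 4) |
+                    (b >> 4));
+}
+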
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "vmov.u8 d7, #0x0f \n" // vbic bits to clear
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n" ARGBTOARGB4444
+ "vst1.8 {q0}, [%[dst_argb4444]]! \n" // store 8 pixels
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "q3");
+}
+
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUV400 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d23, #255 \n"
+ "1: \n"
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d20", "d21", "d22", "d23");
+}
+
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV21 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_vu] "+r"(src_vu), // %[src_vu]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV21 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_vu] "+r"(src_vu), // %[src_vu]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n" ARGBTORGB565
+ "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUY2 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READUYVY YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+// Reads 16 pairs of UV and writes even bytes to dst_u and odd bytes to dst_v.
+void SplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store U
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Reads 16 bytes of Y from a tile and writes out 16 Y's.
+// MM21 Y tiles are 16x32, so src_tile_stride = 512 bytes.
+// MM21 UV tiles are 8x16, so src_tile_stride = 256 bytes.
+// width is measured in bytes, so 8 UV pairs = 16.
+void DetileRow_NEON(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "pld [%0, #1792] \n"
+ "vst1.8 {q0}, [%1]! \n" // store 16 bytes
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "q0" // Clobber List
+ );
+}
+
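+// A scalar sketch of the same walk: each 16-byte group of the output row
+// comes from a different tile, so the source pointer advances by
+// src_tile_stride between groups. Illustrative only.
+static void DetileRowSketch_C(const uint8_t* src,
+                              ptrdiff_t src_tile_stride,
+                              uint8_t* dst,
+                              int width) {
+  int x, i;
+  for (x = 0; x < width; x += 16) {
+    for (i = 0; i < 16; ++i) {
+      dst[i] = src[i];  // copy one 16-byte row of the current tile
+    }
+    dst += 16;
+    src += src_tile_stride;  // jump to the same row of the next tile
+  }
+}
+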
+// Reads 16 16-bit Y values from a tile and writes out 16 Y's.
+void DetileRow_16_NEON(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "pld [%0, #3584] \n"
+ "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride * 2) // %3
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
+void DetileSplitUVRow_NEON(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], %4 \n"
+ "subs %3, %3, #16 \n"
+ "pld [%0, #1792] \n"
+ "vst1.8 {d0}, [%1]! \n"
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(src_tile_stride) // %4
+ : "cc", "memory", "d0", "d1" // Clobber List
+ );
+}
+
+#if LIBYUV_USE_ST2
+// Read 16 Y, 8 UV, and write 8 YUYV.
+void DetileToYUY2_NEON(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y
+ "pld [%0, #1792] \n"
+ "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV
+ "pld [%1, #1792] \n"
+ "subs %3, %3, #16 \n"
+ "vst2.8 {q0, q1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber list
+ );
+}
+#else
+// Read 16 Y, 8 UV, and write 8 YUYV.
+void DetileToYUY2_NEON(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y
+ "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV
+ "subs %3, %3, #16 \n"
+ "pld [%0, #1792] \n"
+ "vzip.8 q0, q1 \n"
+ "pld [%1, #1792] \n"
+ "vst1.8 {q0, q1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber list
+ );
+}
+#endif
+
+void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q14}, [%0]! \n" // Load lower bits.
+ "vld1.8 {q9}, [%0]! \n" // Load upper bits row
+ // by row.
+ "vld1.8 {q11}, [%0]! \n"
+ "vld1.8 {q13}, [%0]! \n"
+ "vld1.8 {q15}, [%0]! \n"
+ "vshl.u8 q8, q14, #6 \n" // Shift lower bit data
+ // appropriately.
+ "vshl.u8 q10, q14, #4 \n"
+ "vshl.u8 q12, q14, #2 \n"
+ "vzip.u8 q8, q9 \n" // Interleave upper and
+ // lower bits.
+ "vzip.u8 q10, q11 \n"
+ "vzip.u8 q12, q13 \n"
+ "vzip.u8 q14, q15 \n"
+ "vsri.u16 q8, q8, #10 \n" // Copy upper 6 bits
+ // into lower 6 bits for
+ // better accuracy in
+ // conversions.
+ "vsri.u16 q9, q9, #10 \n"
+ "vsri.u16 q10, q10, #10 \n"
+ "vsri.u16 q11, q11, #10 \n"
+ "vsri.u16 q12, q12, #10 \n"
+ "vsri.u16 q13, q13, #10 \n"
+ "vsri.u16 q14, q14, #10 \n"
+ "vsri.u16 q15, q15, #10 \n"
+ "vstmia %1!, {q8-q15} \n" // Store pixel block (64
+ // pixels).
+ "subs %2, %2, #80 \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(size) // %2
+ :
+ : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
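+// A scalar sketch of one 80-byte MT2T block (64 output pixels): 16 bytes of
+// packed 2-bit LSBs followed by four rows of 16 8-bit MSBs. The bit layout
+// (LSB pair of row r, column c at bits [2r+1:2r] of byte c) is inferred from
+// the shift/zip sequence above, so treat it as illustrative.
+static void UnpackMT2TBlockSketch_C(const uint8_t* src, uint16_t* dst) {
+  const uint8_t* lo = src;       // 2-bit LSBs, four rows packed per byte
+  const uint8_t* hi = src + 16;  // 8-bit MSBs, row by row
+  int p;
+  for (p = 0; p < 64; ++p) {
+    int row = p >> 4;
+    int col = p & 15;
+    uint16_t t = (uint16_t)((hi[p] << 2) | ((lo[col] >> (2 * row)) & 3));
+    dst[p] = (uint16_t)((t << 6) | (t >> 4));  // vsri: copy top 6 bits down
+  }
+}
+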
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load U
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store R
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%3]! \n" // store B
+ "bgt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
+void MergeRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q2}, [%2]! \n" // load B
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
+ "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_rgb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b, dst_a.
+void SplitARGBRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
+ "subs %5, %5, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%3]! \n" // store B
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%1]! \n" // store R
+ "vst1.8 {q3}, [%4]! \n" // store A
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
+void MergeARGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q0}, [%2]! \n" // load B
+ "vld1.8 {q3}, [%3]! \n" // load A
+ "subs %5, %5, #16 \n" // 16 processed per loop
+ "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB
+ "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b.
+void SplitXRGBRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%3]! \n" // store B
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%1]! \n" // store R
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB (A = 255).
+void MergeXRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 q3, #255 \n" // load A(255)
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q0}, [%2]! \n" // load B
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB
+ "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void MergeXR30Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width) {
+ int shift = 10 - depth;
+ asm volatile(
+ "vmov.u32 q14, #1023 \n"
+ "vdup.32 q15, %5 \n"
+ "1: \n"
+ "vld1.16 {d4}, [%2]! \n" // B
+ "vld1.16 {d2}, [%1]! \n" // G
+ "vld1.16 {d0}, [%0]! \n" // R
+ "vmovl.u16 q2, d4 \n" // B
+ "vmovl.u16 q1, d2 \n" // G
+ "vmovl.u16 q0, d0 \n" // R
+ "vshl.u32 q2, q2, q15 \n" // 000B
+ "vshl.u32 q1, q1, q15 \n"
+ "vshl.u32 q0, q0, q15 \n"
+ "vmin.u32 q2, q2, q14 \n"
+ "vmin.u32 q1, q1, q14 \n"
+ "vmin.u32 q0, q0, q14 \n"
+ "vsli.u32 q2, q1, #10 \n" // 00GB
+ "vsli.u32 q2, q0, #20 \n" // 0RGB
+ "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30)
+ "subs %4, %4, #4 \n"
+ "vst1.8 {q2}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+ : "r"(shift) // %5
+ : "memory", "cc", "q0", "q1", "q2", "q14", "q15");
+}
+
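+// Scalar equivalent of the packing above: AR30 is a little-endian 32-bit word
+// with a 2-bit alpha above 10-bit R, G and B fields. Channels are shifted to
+// 10-bit range and clamped to 1023 before packing.
+static inline uint32_t PackAR30(uint32_t r10, uint32_t g10, uint32_t b10) {
+  return 0xc0000000u | (r10 << 20) | (g10 << 10) | b10;  // A = 3 (opaque)
+}
+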
+void MergeXR30Row_10_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int /* depth */,
+ int width) {
+ asm volatile(
+ "vmov.u32 q14, #1023 \n"
+ "1: \n"
+ "vld1.16 {d4}, [%2]! \n" // B
+ "vld1.16 {d2}, [%1]! \n" // G
+ "vld1.16 {d0}, [%0]! \n" // R
+ "vmovl.u16 q2, d4 \n" // 000B
+ "vmovl.u16 q1, d2 \n" // G
+ "vmovl.u16 q0, d0 \n" // R
+ "vmin.u32 q2, q2, q14 \n"
+ "vmin.u32 q1, q1, q14 \n"
+ "vmin.u32 q0, q0, q14 \n"
+ "vsli.u32 q2, q1, #10 \n" // 00GB
+ "vsli.u32 q2, q0, #20 \n" // 0RGB
+ "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30)
+ "subs %4, %4, #4 \n"
+ "vst1.8 {q2}, [%3]! \n"
+ "bgt 1b \n"
+ "3: \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q14");
+}
+
+void MergeAR64Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int mask = (1 << depth) - 1;
+ asm volatile(
+
+ "vdup.u16 q15, %6 \n"
+ "vdup.u16 q14, %7 \n"
+ "1: \n"
+ "vld1.16 {q2}, [%0]! \n" // R
+ "vld1.16 {q1}, [%1]! \n" // G
+ "vld1.16 {q0}, [%2]! \n" // B
+ "vld1.16 {q3}, [%3]! \n" // A
+ "vmin.u16 q2, q2, q14 \n"
+ "vmin.u16 q1, q1, q14 \n"
+ "vmin.u16 q0, q0, q14 \n"
+ "vmin.u16 q3, q3, q14 \n"
+ "vshl.u16 q2, q2, q15 \n"
+ "vshl.u16 q1, q1, q15 \n"
+ "vshl.u16 q0, q0, q15 \n"
+ "vshl.u16 q3, q3, q15 \n"
+ "subs %5, %5, #8 \n"
+ "vst4.16 {d0, d2, d4, d6}, [%4]! \n"
+ "vst4.16 {d1, d3, d5, d7}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_ar64), // %4
+ "+r"(width) // %5
+ : "r"(shift), // %6
+ "r"(mask) // %7
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q15");
+}
+
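+// Scalar model of the clamp-and-shift above: a depth-bit channel is clamped
+// to its own range (vmin.u16), then shifted so its top bit lands at bit 15
+// (vshl.u16). For depth = 10 that is a clamp to 1023 followed by << 6.
+static inline uint16_t ScaleChannelTo16Sketch(uint16_t v, int depth) {
+  uint16_t mask = (uint16_t)((1 << depth) - 1);
+  if (v > mask) v = mask;
+  return (uint16_t)(v << (16 - depth));
+}
+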
+void MergeXR64Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int mask = (1 << depth) - 1;
+ asm volatile(
+
+ "vmov.u8 q3, #0xff \n" // A (0xffff)
+ "vdup.u16 q15, %5 \n"
+ "vdup.u16 q14, %6 \n"
+ "1: \n"
+ "vld1.16 {q2}, [%0]! \n" // R
+ "vld1.16 {q1}, [%1]! \n" // G
+ "vld1.16 {q0}, [%2]! \n" // B
+ "vmin.u16 q2, q2, q14 \n"
+ "vmin.u16 q1, q1, q14 \n"
+ "vmin.u16 q0, q0, q14 \n"
+ "vshl.u16 q2, q2, q15 \n"
+ "vshl.u16 q1, q1, q15 \n"
+ "vshl.u16 q0, q0, q15 \n"
+ "subs %4, %4, #8 \n"
+ "vst4.16 {d0, d2, d4, d6}, [%3]! \n"
+ "vst4.16 {d1, d3, d5, d7}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar64), // %3
+ "+r"(width) // %4
+ : "r"(shift), // %5
+ "r"(mask) // %6
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q15");
+}
+
+void MergeARGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = 8 - depth;
+ asm volatile(
+
+ "vdup.16 q15, %6 \n"
+ "1: \n"
+ "vld1.16 {q2}, [%0]! \n" // R
+ "vld1.16 {q1}, [%1]! \n" // G
+ "vld1.16 {q0}, [%2]! \n" // B
+ "vld1.16 {q3}, [%3]! \n" // A
+ "vshl.u16 q2, q2, q15 \n"
+ "vshl.u16 q1, q1, q15 \n"
+ "vshl.u16 q0, q0, q15 \n"
+ "vshl.u16 q3, q3, q15 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d1, q1 \n"
+ "vqmovn.u16 d2, q2 \n"
+ "vqmovn.u16 d3, q3 \n"
+ "subs %5, %5, #8 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : "r"(shift) // %6
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q15");
+}
+
+void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = 8 - depth;
+ asm volatile(
+
+ "vdup.16 q15, %5 \n"
+ "vmov.u8 d6, #0xff \n" // A (0xff)
+ "1: \n"
+ "vld1.16 {q2}, [%0]! \n" // R
+ "vld1.16 {q1}, [%1]! \n" // G
+ "vld1.16 {q0}, [%2]! \n" // B
+ "vshl.u16 q2, q2, q15 \n"
+ "vshl.u16 q1, q1, q15 \n"
+ "vshl.u16 q0, q0, q15 \n"
+ "vqmovn.u16 d5, q2 \n"
+ "vqmovn.u16 d4, q1 \n"
+ "vqmovn.u16 d3, q0 \n"
+ "subs %4, %4, #8 \n"
+ "vst4.u8 {d3, d4, d5, d6}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(shift) // %5
+ : "memory", "cc", "q0", "q1", "q2", "d6", "q15");
+}
+
+// Copy multiples of 32 bytes. vld1.8 allows unaligned access and is fastest on A15.
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SetRow writes 'width' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+ asm volatile(
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "q0");
+}
+
+// ARGBSetRow writes 'width' pixels using a 32 bit value repeated.
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+ asm volatile(
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "q0");
+}
+
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "add %0, %0, %2 \n"
+ "sub %0, %0, #32 \n" // 32 bytes per loop
+
+ "1: \n"
+ "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32
+ "subs %2, #32 \n" // 32 pixels per loop.
+ "vrev64.8 q0, q2 \n"
+ "vrev64.8 q1, q1 \n"
+ "vswp d0, d1 \n"
+ "vswp d2, d3 \n"
+ "vst1.8 {q0, q1}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(-32) // %3
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %2, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst2.8 {d0, d1}, [%1]! \n" // dst += 16
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r12", "q0");
+}
+
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "r12", "q0");
+}
+
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #32 \n"
+
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vrev64.8 d3, d3 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(-32) // %3
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ src_rgb24 += width * 3 - 24;
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"(-24) // %3
+ : "cc", "memory", "d0", "d1", "d2");
+}
+
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d4, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d4, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "vmov.u8 d0, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ // RGB24.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+#define RGB565TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
+ "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
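+// Scalar equivalent of RGB565TOARGB: expand each field to 8 bits by shifting
+// it up and replicating its top bits into the vacated low bits, which maps 0
+// to 0 and the field maximum to 255.
+static inline void RGB565To888Sketch(uint16_t p, uint8_t* b, uint8_t* g,
+                                     uint8_t* r) {
+  uint8_t b5 = (uint8_t)(p & 0x1f);
+  uint8_t g6 = (uint8_t)((p >> 5) & 0x3f);
+  uint8_t r5 = (uint8_t)(p >> 11);
+  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
+  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
+  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
+}
+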
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB1555TOARGB \
+ "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
+ "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
+ "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
+ "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
+ "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
+ "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
+ "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
+ "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
+ "vorr.u8 q1, q1, q3 \n" /* R,A */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,G */
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
+ "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
+ "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB4444TOARGB \
+ "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
+ "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
+ "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
+ "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
+ "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
+ "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
+ "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
+
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst3.8 {d0, d2, d4}, [%1]! \n" // store 16 RGB24 pixels.
+ "vst3.8 {d1, d3, d5}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // stride + src_yuy2
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(stride_yuy2), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+ "d7" // Clobber List
+ );
+}
+
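+// vrhadd.u8 is a rounding halving add; the scalar equivalent used to average
+// the chroma of two rows above is:
+static inline uint8_t RoundAvgU8(uint8_t a, uint8_t b) {
+  return (uint8_t)((a + b + 1) >> 1);
+}
+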
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // stride + src_uyvy
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(stride_uyvy), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+ "d7" // Clobber List
+ );
+}
+
+void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // stride + src_yuy2
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vld2.8 {q2, q3}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 q4, q1, q3 \n" // average rows of UV
+ "vst1.8 {q4}, [%2]! \n" // store 8 UV.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(stride_yuy2), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+ "d7" // Clobber List
+ );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ asm volatile(
+ "vld1.8 {q2}, [%3] \n" // shuffler
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
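+// Scalar model of the vtbl lookup above: within each 16-byte (4 pixel) group,
+// the shuffler supplies, for every output byte, the index of the source byte
+// to copy. Illustrative only; a per-pixel variant would apply the first four
+// indices to each pixel.
+static void ARGBShuffleSketch_C(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint8_t* shuffler,
+                                int width) {
+  int i;
+  for (i = 0; i < width * 4; ++i) {
+    dst[i] = src[(i & ~15) + shuffler[i & 15]];
+  }
+}
+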
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb565,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565
+ "vst1.8 {q2}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "d6");
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
+ int width) {
+ asm volatile(
+ "vdup.32 d7, %2 \n" // dither4
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d7 \n"
+ "vqadd.u8 d2, d2, d7 \n"
+ "vqadd.u8 d4, d4, d7 \n" // add for dither
+ ARGBTORGB565
+ "vst1.8 {q2}, [%0]! \n" // store 8 RGB565.
+ "bgt 1b \n"
+ : "+r"(dst_rgb) // %0
+ : "r"(src_argb), // %1
+ "r"(dither4), // %2
+ "r"(width) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb1555,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555
+ "vst1.8 {q3}, [%1]! \n" // store 8 ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb4444,
+ int width) {
+ asm volatile(
+ "vmov.u8 d7, #0x0f \n" // bits to clear with
+ // vbic.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q3}, [%1]! \n" // store 16 A's.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+struct RgbUVConstants {
+ uint8_t kRGBToU[4];
+ uint8_t kRGBToV[4];
+};
+
+// 8x1 pixels -> 8 U, 8 V: full-resolution (4:4:4) chroma, no subsampling.
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width,
+ const struct RgbUVConstants* rgbuvconstants) {
+ asm volatile(
+
+ "vld1.8 {d0}, [%4] \n" // load rgbuvconstants
+ "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient
+ "vdup.u8 d25, d0[1] \n" // UG -0.5781 coefficient
+ "vdup.u8 d26, d0[2] \n" // UR -0.2969 coefficient
+ "vdup.u8 d27, d0[4] \n" // VB -0.1406 coefficient
+ "vdup.u8 d28, d0[5] \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+
+ "vaddhn.u16 d0, q2, q15 \n" // +128 -> unsigned
+ "vaddhn.u16 d1, q3, q15 \n" // +128 -> unsigned
+
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(rgbuvconstants) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
+ "q15");
+}
+
+// RGB to bt601 coefficients
+// UB 0.875 coefficient = 112
+// UG -0.5781 coefficient = 74
+// UR -0.2969 coefficient = 38
+// VB -0.1406 coefficient = 18
+// VG -0.7344 coefficient = 94
+// VR 0.875 coefficient = 112 (not loaded; equals UB, so d24 is reused)
+
+static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
+ {18, 94, 112, 0}};
+
+// RGB to JPEG coefficients
+// UB coeff 0.500 = 127
+// UG coeff -0.33126 = 84
+// UR coeff -0.16874 = 43
+// VB coeff -0.08131 = 20
+// VG coeff -0.41869 = 107
+// VR coeff 0.500 = 127 (not loaded; equals UB)
+
+static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
+ {20, 107, 127, 0}};
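+
+// Illustrative scalar equivalent (exposition only; these helper names are
+// hypothetical and not part of the library). With either constant set above,
+// one pixel's chroma is:
+//   U = (UB * B - UG * G - UR * R + 0x8080) >> 8
+//   V = (VR * R - VG * G - VB * B + 0x8080) >> 8
+// The +0x8080 supplies both the +128 bias and the +0.5 rounding term that
+// the vaddhn with q15 (#0x8080) applies in the kernels above.
+static inline uint8_t RGBToU_Sketch(uint8_t r, uint8_t g, uint8_t b) {
+  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);  // BT.601
+}
+static inline uint8_t RGBToV_Sketch(uint8_t r, uint8_t g, uint8_t b) {
+  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);  // BT.601
+}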
+
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+ &kRgb24I601UVConstants);
+}
+
+void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+ &kRgb24JPegUVConstants);
+}
+
+// clang-format off
+// 16x2 pixels -> 8x1. width is the number of source pixels, e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+ "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
+ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
+ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
+ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
+ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
+ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
+ "vaddhn.u16 d0, q8, q15 \n" /* +128 -> unsigned */ \
+ "vaddhn.u16 d1, q9, q15 \n" /* +128 -> unsigned */
+// clang-format on
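+
+// Arithmetic note (exposition, not from the source): callers hand this macro
+// 16-bit lanes holding 2x the average of a 2x2 block (vpaddl/vpadal sum four
+// bytes, vrshr #1 halves once with rounding). The coefficients are loaded
+// pre-halved (#112 / 2 etc.), so each product is average * full_coefficient,
+// and vaddhn with q15 (#0x8080) yields the same
+//   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
+// as the full-resolution path, evaluated on the box average. For the odd
+// JPEG coefficients the halving truncates (127 / 2 == 63, effectively 126),
+// a small known approximation.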
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): Subsample match Intel code.
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_uj,
+ uint8_t* dst_vj,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_stride_abgr), // %1
+ "+r"(dst_uj), // %2
+ "+r"(dst_vj), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_stride_rgb24), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void RAWToUVJRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_stride_raw), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q1, q1, #1 \n" // 2x average
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q3, q2, q1)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_stride_bgra), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_stride_abgr), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_stride_rgba), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_stride_rgb24), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_stride_raw), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is the number of rgb565 pixels, e.g. 16.
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_stride_rgb565), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// 16x2 pixels -> 8x1. width is the number of argb1555 pixels, e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_stride_argb1555), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// 16x2 pixels -> 8x1. width is the number of argb4444 pixels, e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q0, q4, #1 \n" // 2x average
+ "vrshr.u16 q1, q5, #1 \n"
+ "vrshr.u16 q2, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_stride_argb4444), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void ARGBToAR64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q2}, [%0]! \n"
+ "vmov.u8 q1, q0 \n"
+ "vmov.u8 q3, q2 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels
+ "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ar64), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
+ 10, 9, 8, 11, 14, 13, 12, 15};
+
+void ARGBToAB64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
+ asm volatile(
+ "vld1.8 {q4}, [%3] \n" // shuffler
+
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q2}, [%0]! \n"
+ "vtbl.8 d2, {d0, d1}, d8 \n"
+ "vtbl.8 d3, {d0, d1}, d9 \n"
+ "vtbl.8 d6, {d4, d5}, d8 \n"
+ "vtbl.8 d7, {d4, d5}, d9 \n"
+ "vmov.u8 q0, q1 \n"
+ "vmov.u8 q2, q3 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels
+ "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleARGBToABGR) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+}
+
+void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n"
+ "vld1.16 {q1}, [%0]! \n"
+ "vld1.16 {q2}, [%0]! \n"
+ "vld1.16 {q3}, [%0]! \n"
+ "vshrn.u16 d0, q0, #8 \n"
+ "vshrn.u16 d1, q1, #8 \n"
+ "vshrn.u16 d4, q2, #8 \n"
+ "vshrn.u16 d5, q3, #8 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 4 pixels
+ "vst1.8 {q2}, [%1]! \n" // store 4 pixels
+ "bgt 1b \n"
+ : "+r"(src_ar64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15};
+
+void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vld1.8 {d8}, [%3] \n" // shuffler
+
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n"
+ "vld1.16 {q1}, [%0]! \n"
+ "vld1.16 {q2}, [%0]! \n"
+ "vld1.16 {q3}, [%0]! \n"
+ "vtbl.8 d0, {d0, d1}, d8 \n"
+ "vtbl.8 d1, {d2, d3}, d8 \n"
+ "vtbl.8 d4, {d4, d5}, d8 \n"
+ "vtbl.8 d5, {d6, d7}, d8 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 4 pixels
+ "vst1.8 {q2}, [%1]! \n" // store 4 pixels
+ "bgt 1b \n"
+ : "+r"(src_ab64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleAB64ToARGB) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+}
+
+struct RgbConstants {
+ uint8_t kRGBToY[4];
+ uint16_t kAddY;
+};
+
+// RGB to JPEG coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+ 0x1080};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
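+
+// Illustrative scalar equivalent (exposition only; the helper name is
+// hypothetical). kAddY folds the +16 offset and the +0.5 rounding term into
+// one constant (0x1080 == 16.5 * 256), added by vaddhn in the kernels below:
+static inline uint8_t RGBToY601_Sketch(uint8_t r, uint8_t g, uint8_t b) {
+  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
+}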
+
+// ARGB expects the first 3 bytes to contain RGB; the 4th byte is ignored.
+void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "vld1.8 {d0}, [%3] \n" // load rgbconstants
+ "vdup.u8 d20, d0[0] \n"
+ "vdup.u8 d21, d0[1] \n"
+ "vdup.u8 d22, d0[2] \n"
+ "vdup.u16 q12, d0[2] \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vmull.u8 q8, d0, d20 \n" // B
+ "vmull.u8 q9, d1, d20 \n"
+ "vmlal.u8 q8, d2, d21 \n" // G
+ "vmlal.u8 q9, d3, d21 \n"
+ "vmlal.u8 q8, d4, d22 \n" // R
+ "vmlal.u8 q9, d5, d22 \n"
+ "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
+ "vaddhn.u16 d1, q9, q12 \n"
+ "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+ "q12");
+}
+
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
+}
+
+void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
+}
+
+// RGBA expects the first byte to be A (ignored), followed by 3 bytes of RGB.
+// Same code as ARGB, except which vld4 registers feed the multiplies.
+void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "vld1.8 {d0}, [%3] \n" // load rgbconstants
+ "vdup.u8 d20, d0[0] \n"
+ "vdup.u8 d21, d0[1] \n"
+ "vdup.u8 d22, d0[2] \n"
+ "vdup.u16 q12, d0[2] \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vmull.u8 q8, d2, d20 \n" // B
+ "vmull.u8 q9, d3, d20 \n"
+ "vmlal.u8 q8, d4, d21 \n" // G
+ "vmlal.u8 q9, d5, d21 \n"
+ "vmlal.u8 q8, d6, d22 \n" // R
+ "vmlal.u8 q9, d7, d22 \n"
+ "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
+ "vaddhn.u16 d1, q9, q12 \n"
+ "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+ "q12");
+}
+
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
+}
+
+void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
+ RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
+}
+
+void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "vld1.8 {d0}, [%3] \n" // load rgbconstants
+ "vdup.u8 d20, d0[0] \n"
+ "vdup.u8 d21, d0[1] \n"
+ "vdup.u8 d22, d0[2] \n"
+ "vdup.u16 q12, d0[2] \n"
+ "1: \n"
+ "vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of
+ // RGB24.
+ "vld3.8 {d3, d5, d7}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vmull.u8 q8, d2, d20 \n" // B
+ "vmull.u8 q9, d3, d20 \n"
+ "vmlal.u8 q8, d4, d21 \n" // G
+ "vmlal.u8 q9, d5, d21 \n"
+ "vmlal.u8 q8, d6, d22 \n" // R
+ "vmlal.u8 q9, d7, d22 \n"
+ "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
+ "vaddhn.u16 d1, q9, q12 \n"
+ "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+ "q12");
+}
+
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
+}
+
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
+}
+
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction) // %4
+ :
+ : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
+}
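+
+// Scalar sketch of the general blend path above (illustrative only; the
+// helper name is hypothetical). For a fraction f in [0, 256), each output
+// byte is a rounded weighted average of the two rows; the f == 0 and
+// f == 128 branches are fast cases of the same formula.
+static inline uint8_t InterpolatePixel_Sketch(uint8_t s0, uint8_t s1, int f) {
+  return (uint8_t)((s0 * (256 - f) + s1 * f + 128) >> 8);  // cf. vrshrn #8
+}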
+
+// Bilinear filter 8x2 -> 8x1
+void InterpolateRow_16_NEON(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+
+ "vdup.16 d17, %4 \n"
+ "vdup.16 d16, %5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.16 {q0}, [%1]! \n"
+ "vld1.16 {q1}, [%2]! \n"
+ "subs %3, %3, #8 \n"
+ "vmull.u16 q2, d0, d16 \n"
+ "vmull.u16 q3, d1, d16 \n"
+ "vmlal.u16 q2, d2, d17 \n"
+ "vmlal.u16 q3, d3, d17 \n"
+ "vrshrn.u32 d0, q2, #8 \n"
+ "vrshrn.u32 d1, q3, #8 \n"
+ "vst1.16 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.16 {q0}, [%1]! \n"
+ "vld1.16 {q1}, [%2]! \n"
+ "subs %3, %3, #8 \n"
+ "vrhadd.u16 q0, q1 \n"
+ "vst1.16 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.16 {q0}, [%1]! \n"
+ "subs %3, %3, #8 \n"
+ "vst1.16 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width) // %3
+ : "r"(y1_fraction), // %4
+ "r"(y0_fraction) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8");
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "subs %3, #8 \n"
+ "blt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
+
+ "89: \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
+
+ // Blend 1 pixels.
+ "1: \n"
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
+}
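+
+// Scalar sketch of the blend above (illustrative only; the helper name is
+// hypothetical). The destination is faded by source alpha with a rounded
+// shift (vqrshrn), then the already-attenuated source is added with
+// saturation (vqadd), and alpha is forced to 255.
+static inline uint8_t BlendChannel_Sketch(uint8_t s, uint8_t d, uint8_t a) {
+  int faded = d - ((d * a + 128) >> 8);  // dr - dr * sa / 256, rounded
+  int sum = faded + s;                   // + sr
+  return (uint8_t)(sum > 255 ? 255 : sum);
+}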
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u16 q15, #0x00ff \n" // 255 for rounding up
+
+ // Attenuate 8 pixels.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
+ "vaddhn.u16 d0, q10, q15 \n" // (b + 255) >> 8
+ "vaddhn.u16 d1, q11, q15 \n" // (g + 255) >> 8
+ "vaddhn.u16 d2, q12, q15 \n" // (r + 255) >> 8
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q10", "q11", "q12", "q15");
+}
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ asm volatile(
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
+
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
+ "vqdmulh.s16 q0, q0, q8 \n" // b * scale
+ "vqdmulh.s16 q1, q1, q8 \n" // g
+ "vqdmulh.s16 q2, q2, q8 \n" // r
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
+}
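+
+// Scalar sketch (illustrative only; the helper name is hypothetical).
+// vqdmulh doubles its product and keeps the high 16 bits, so pre-halving
+// scale gives essentially (v * scale) >> 16; the final vqmovn saturation is
+// omitted here.
+static inline uint8_t Quantize_Sketch(uint8_t v, int scale, int interval_size,
+                                      int interval_offset) {
+  return (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
+}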
+
+// Shade 8 pixels at a time by specified value.
+// NOTE: the scalar operand of vqrdmulh.s16 q10, q10, d0[0] must come from
+// d0 to d7, indexed with [0] to [3].
+// Rounding in vqrdmulh adds 1 to the high half if the high bit of the low
+// s16 is set.
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ asm volatile(
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
+ "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
+ "vqrdmulh.s16 q11, q11, d0[1] \n" // g
+ "vqrdmulh.s16 q12, q12, d0[2] \n" // r
+ "vqrdmulh.s16 q13, q13, d0[3] \n" // a
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
+}
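+
+// Arithmetic note (exposition, not from the source): vzip.u8 widens each
+// value byte to byte * 0x101, and the vshr halves it, so vqrdmulh computes
+// roughly (channel * byte * 0x101) >> 16 per channel, i.e. approximately
+// channel * value_byte / 255, a per-channel scale by value/255 with rounding.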
+
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
+ "q14", "q15");
+}
+
+// Transform 8 ARGB pixels (32 bytes) with a color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ asm volatile(
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+ "1: \n"
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
+ "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
+ "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
+ "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ // 16 pixel loop.
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
+}
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
+}
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2), // %5
+ "r"(6) // %6
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
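+// Scalar sketch of the loop below, per output pixel i, with the two source
+// rows y0 and y1 supplied by the caller:
+//   sobely[i] = min(255, |(y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) +
+//                         (y0[i+2] - y1[i+2])|)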
+void SobelYRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1), // %4
+ "r"(6) // %5
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// %y passes a float as a scalar vector for vector * scalar multiply.
+// The register must be d0 to d15 and indexed with [0] or [1] to access
+// the first or second float in the d-register.
+
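+// The constant 1.9259299444e-34f below is 2^-112. Multiplying by it rebiases
+// the float exponent from 127 to 15 (the half-float bias), so the
+// vqshrn.u32 #13 that follows can truncate the 23-bit mantissa to 10 bits
+// and emit raw IEEE half-float bits.
+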
+void HalfFloat1Row_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float /*unused*/,
+ int width) {
+ asm volatile(
+
+ "1: \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(1.9259299444e-34f) // %3
+ : "cc", "memory", "q1", "q2", "q3");
+}
+
+void HalfFloatRow_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ asm volatile(
+
+ "1: \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale * 1.9259299444e-34f) // %3
+ : "cc", "memory", "q1", "q2", "q3");
+}
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+ float* dst,
+ float scale,
+ int width) {
+ asm volatile(
+
+ "1: \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 bytes
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u8 q1, d2 \n" // 8 shorts
+ "vmovl.u16 q2, d2 \n" // 8 ints
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // scale
+ "vmul.f32 q3, q3, %y3 \n"
+ "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "q1", "q2", "q3");
+}
+
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_NEON(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width) {
+ asm volatile(
+ "vmov.u16 d6, #4 \n" // constant 4
+ "vmov.u16 d7, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
+ "vld1.16 {q2}, [%4]! \n"
+ "vaddl.u16 q0, d2, d4 \n" // * 1
+ "vaddl.u16 q1, d3, d5 \n" // * 1
+ "vld1.16 {q2}, [%1]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "vld1.16 {q2}, [%2]! \n"
+ "vmlal.u16 q0, d4, d7 \n" // * 6
+ "vmlal.u16 q1, d5, d7 \n" // * 6
+ "vld1.16 {q2}, [%3]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "subs %6, %6, #8 \n" // 8 processed per loop
+ "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
+ "bgt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Filter one row horizontally with 1, 4, 6, 4, 1 coefficients to produce 1 row.
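+// Scalar sketch, per output sample i (saturating narrow to 16 bits):
+//   dst[i] = (src[i] + 4 * src[i+1] + 6 * src[i+2] + 4 * src[i+3] +
+//             src[i+4]) >> 8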
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+ const uint32_t* src1 = src + 1;
+ const uint32_t* src2 = src + 2;
+ const uint32_t* src3 = src + 3;
+ asm volatile(
+ "vmov.u32 q10, #4 \n" // constant 4
+ "vmov.u32 q11, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
+ "vld1.32 {q2}, [%0] \n"
+ "vadd.u32 q0, q0, q1 \n" // * 1
+ "vadd.u32 q1, q1, q2 \n" // * 1
+ "vld1.32 {q2, q3}, [%2]! \n"
+ "vmla.u32 q0, q2, q11 \n" // * 6
+ "vmla.u32 q1, q3, q11 \n" // * 6
+ "vld1.32 {q2, q3}, [%1]! \n"
+ "vld1.32 {q8, q9}, [%3]! \n"
+ "vadd.u32 q2, q2, q8 \n" // add rows for * 4
+ "vadd.u32 q3, q3, q9 \n"
+ "vmla.u32 q0, q2, q10 \n" // * 4
+ "vmla.u32 q1, q3, q10 \n" // * 4
+ "subs %5, %5, #8 \n" // 8 processed per loop
+ "vqshrn.u32 d0, q0, #8 \n" // round and pack
+ "vqshrn.u32 d1, q1, #8 \n"
+ "vst1.u16 {q0}, [%4]! \n" // store 8 samples
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+}
+
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load 16 Y values
+ "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
+ "vmov d1, d0 \n"
+ "vzip.u8 d0, d1 \n" // VV
+ "vmov d3, d2 \n"
+ "vzip.u8 d2, d3 \n" // UU
+ "subs %3, %3, #16 \n" // 16 pixels per loop
+ "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
+ "vst3.8 {d1, d3, d5}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ // pixels.
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ // pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ // pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
+ "vqrshrun.s16 d0, q1, #2 \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_stride_ayuv), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ // pixels.
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ // pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ // pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
+ "vqrshrun.s16 d1, q1, #2 \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_stride_ayuv), // %1
+ "+r"(dst_vu), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+// Copy row of AYUV Y's into Y.
+// Similar to ARGBExtractAlphaRow_NEON
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
+ "vld2.8 {d1, d3}, [%0]! \n"
+ "vorr.u8 q2, q0, q0 \n" // move U after V
+ "subs %2, %2, #16 \n" // 16 pixels per loop
+ "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
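+// Half-scales U and V with a rounding 2x2 box filter and interleaves the
+// result as UV. Sketch per output u: (u00 + u01 + u10 + u11 + 2) >> 2,
+// and likewise for v.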
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u;
+ const uint8_t* src_v_1 = src_v + src_stride_v;
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 U values
+ "vld1.8 {q1}, [%2]! \n" // load 16 V values
+ "vld1.8 {q2}, [%1]! \n"
+ "vld1.8 {q3}, [%3]! \n"
+ "vpaddl.u8 q0, q0 \n" // half size
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q1, q3 \n"
+ "vqrshrn.u16 d0, q0, #2 \n"
+ "vqrshrn.u16 d1, q1, #2 \n"
+ "subs %5, %5, #16 \n" // 16 src pixels per loop
+ "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+void SplitUVRow_16_NEON(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {
+ int shift = depth - 16; // Negative for right shift.
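+ // e.g. depth 10 gives shift -6; vshl by a negative amount shifts right,
+ // moving msb-aligned 16-bit samples down to 10-bit lsb values.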
+ asm volatile(
+ "vdup.16 q2, %4 \n"
+ "1: \n"
+ "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
+ "vshl.u16 q0, q0, q2 \n"
+ "vshl.u16 q1, q1, q2 \n"
+ "subs %3, %3, #8 \n" // 8 src pixels per loop
+ "vst1.16 {q0}, [%1]! \n" // store 8 U pixels
+ "vst1.16 {q1}, [%2]! \n" // store 8 V pixels
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ asm volatile(
+ "vdup.16 q2, %4 \n"
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // load 8 U
+ "vld1.16 {q1}, [%1]! \n" // load 8 V
+ "vshl.u16 q0, q0, q2 \n"
+ "vshl.u16 q1, q1, q2 \n"
+ "subs %3, %3, #8 \n" // 8 src pixels per loop
+ "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void MultiplyRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ asm volatile(
+ "vdup.16 q2, %3 \n"
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n"
+ "vld1.16 {q1}, [%0]! \n"
+ "vmul.u16 q0, q0, q2 \n"
+ "vmul.u16 q1, q1, q2 \n"
+ "vst1.16 {q0}, [%1]! \n"
+ "vst1.16 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n" // 16 src pixels per loop
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
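+// Fixed-point divide: dst = (src * scale) >> 16, e.g. scale 0x8000 halves
+// each sample.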
+void DivideRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ asm volatile(
+ "vdup.16 d8, %3 \n"
+ "1: \n"
+ "vld1.16 {q2, q3}, [%0]! \n"
+ "vmull.u16 q0, d4, d8 \n"
+ "vmull.u16 q1, d5, d8 \n"
+ "vmull.u16 q2, d6, d8 \n"
+ "vmull.u16 q3, d7, d8 \n"
+ "vshrn.u32 d0, q0, #16 \n"
+ "vshrn.u32 d1, q1, #16 \n"
+ "vshrn.u32 d2, q2, #16 \n"
+ "vshrn.u32 d3, q3, #16 \n"
+ "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
+ "subs %2, %2, #16 \n" // 16 src pixels per loop
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "d8");
+}
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits = shr 1
+// 16384 = 10 bits = shr 2
+// 4096 = 12 bits = shr 4
+// 256 = 16 bits = shr 8
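+// e.g. scale 16384: shift = 15 - __builtin_clz(16384) = 15 - 17 = -2, and a
+// vshl by -2 is a right shift by 2, mapping 10-bit input to 8-bit output.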
+void Convert16To8Row_NEON(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
+ asm volatile(
+ "vdup.16 q2, %3 \n"
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n"
+ "vld1.16 {q1}, [%0]! \n"
+ "vshl.u16 q0, q0, q2 \n" // shr = q2 is negative
+ "vshl.u16 q1, q1, q2 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d1, q1 \n"
+ "subs %2, %2, #16 \n" // 16 src pixels per loop
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(shift) // %3
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
new file mode 100644
index 00000000..1679f87c
--- /dev/null
+++ b/source/row_neon64.cc
@@ -0,0 +1,4630 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer
+// STn over ZIP1+ST1
+// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions.
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// v0.8h: Y
+// v1.16b: 8U, 8V
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ "ldr d0, [%[src_y]], #8 \n" \
+ "ld1 {v1.s}[0], [%[src_u]], #4 \n" \
+ "ld1 {v1.s}[1], [%[src_v]], #4 \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "zip1 v1.16b, v1.16b, v1.16b \n" \
+ "prfm pldl1keep, [%[src_u], 128] \n" \
+ "prfm pldl1keep, [%[src_v], 128] \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ "ldr d0, [%[src_y]], #8 \n" \
+ "ld1 {v1.d}[0], [%[src_u]], #8 \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "ld1 {v1.d}[1], [%[src_v]], #8 \n" \
+ "prfm pldl1keep, [%[src_u], 448] \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n" \
+ "prfm pldl1keep, [%[src_v], 448] \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ "ldr d0, [%[src_y]], #8 \n" \
+ "movi v1.16b, #128 \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n"
+
+static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6,
+ 1, 1, 3, 3, 5, 5, 7, 7};
+static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
+ 0, 0, 2, 2, 4, 4, 6, 6};
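+// The tbl shuffle in READNV12 below uses these tables to duplicate each
+// chroma byte (U0 U0 U1 U1 ... V0 V0 ...); kNV21Table additionally swaps the
+// U and V halves so the same macro serves NV21.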
+
+// Read 8 Y and 4 UV from NV12 or NV21
+#define READNV12 \
+ "ldr d0, [%[src_y]], #8 \n" \
+ "ldr d1, [%[src_uv]], #8 \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "tbl v1.16b, {v1.16b}, v2.16b \n" \
+ "prfm pldl1keep, [%[src_uv], 448] \n"
+
+// Read 8 YUY2
+#define READYUY2 \
+ "ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n" \
+ "prfm pldl1keep, [%[src_yuy2], 448] \n" \
+ "tbl v1.16b, {v1.16b}, v2.16b \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ "ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \
+ "zip1 v0.16b, v4.16b, v4.16b \n" \
+ "prfm pldl1keep, [%[src_uyvy], 448] \n" \
+ "tbl v1.16b, {v3.16b}, v2.16b \n"
+
+// UB VR UG VG
+// YG BB BG BR
+#define YUVTORGB_SETUP \
+ "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \
+ "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n"
+
+// v16.8h: B
+// v17.8h: G
+// v18.8h: R
+
+// Convert from YUV to 2.14 fixed point RGB
+#define YUVTORGB \
+ "umull2 v3.4s, v0.8h, v24.8h \n" \
+ "umull v6.8h, v1.8b, v30.8b \n" \
+ "umull v0.4s, v0.4h, v24.4h \n" \
+ "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \
+ "uqshrn v0.4h, v0.4s, #16 \n" \
+ "uqshrn2 v0.8h, v3.4s, #16 \n" /* Y */ \
+ "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \
+ "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \
+ "add v17.8h, v0.8h, v26.8h \n" /* G */ \
+ "add v16.8h, v0.8h, v4.8h \n" /* B */ \
+ "add v18.8h, v0.8h, v5.8h \n" /* R */ \
+ "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \
+ "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
+ "uqsub v18.8h, v18.8h, v27.8h \n" /* R */
+
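+// In scalar terms (sketch): Y' = (Y * 0x0101 * YG) >> 16, then
+//   B = sat(Y' + U * UB - BB)
+//   G = sat(Y' + BG - (U * UG + V * VG))
+//   R = sat(Y' + V * VR - BR)
+// with unsigned saturating adds/subtracts, leaving 2.14 fixed point values.
+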
+// Convert from 2.14 fixed point RGB To 8 bit RGB
+#define RGBTORGB8 \
+ "uqshrn v17.8b, v17.8h, #6 \n" \
+ "uqshrn v16.8b, v16.8h, #6 \n" \
+ "uqshrn v18.8b, v18.8h, #6 \n"
+
+#define YUVTORGB_REGS \
+ "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \
+ "v26", "v27", "v28", "v29", "v30", "v31"
+
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n" /* A */
+ "1: \n" READYUV444 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+void I444ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV444 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n" /* A */
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n"
+ "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444
+ "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n"
+ "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422
+ "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v15.8b, #255 \n" /* A */
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgba] "+r"(dst_rgba), // %[dst_rgba]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v15");
+}
+
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+#define ARGBTORGB565 \
+ "shll v18.8h, v18.8b, #8 \n" /* R */ \
+ "shll v17.8h, v17.8b, #8 \n" /* G */ \
+ "shll v16.8h, v16.8b, #8 \n" /* B */ \
+ "sri v18.8h, v17.8h, #5 \n" /* RG */ \
+ "sri v18.8h, v16.8h, #11 \n" /* RGB */
+
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8 "subs %w[width], %w[width], #8 \n" ARGBTORGB565
+ "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
+}
+
+#define ARGBTOARGB1555 \
+ "shll v0.8h, v19.8b, #8 \n" /* A */ \
+ "shll v18.8h, v18.8b, #8 \n" /* R */ \
+ "shll v17.8h, v17.8b, #8 \n" /* G */ \
+ "shll v16.8h, v16.8b, #8 \n" /* B */ \
+ "sri v0.8h, v18.8h, #1 \n" /* AR */ \
+ "sri v0.8h, v17.8h, #6 \n" /* ARG */ \
+ "sri v0.8h, v16.8h, #11 \n" /* ARGB */
+
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n" ARGBTOARGB1555
+ "st1 {v0.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels
+ // RGB565.
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+#define ARGBTOARGB4444 \
+ /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \
+ "ushr v16.8b, v16.8b, #4 \n" /* B */ \
+ "bic v17.8b, v17.8b, v23.8b \n" /* G */ \
+ "ushr v18.8b, v18.8b, #4 \n" /* R */ \
+ "bic v19.8b, v19.8b, v23.8b \n" /* A */ \
+ "orr v0.8b, v16.8b, v17.8b \n" /* BG */ \
+ "orr v1.8b, v18.8b, v19.8b \n" /* RA */ \
+ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
+
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v23.16b, #0x0f \n" // bits to clear with
+ // vbic.
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "movi v19.8b, #255 \n" ARGBTOARGB4444
+ "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8
+ // pixels
+ // ARGB4444.
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19", "v23");
+}
+
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "1: \n" READYUV400 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+#if LIBYUV_USE_ST4
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v20", "v21", "v22", "v23");
+}
+#else
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v20.8b, #255 \n"
+ "1: \n"
+ "ldr d16, [%0], #8 \n"
+ "subs %w2, %w2, #8 \n"
+ "zip1 v18.16b, v16.16b, v16.16b \n" // YY
+ "zip1 v19.16b, v16.16b, v20.16b \n" // YA
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v16.16b, v18.16b, v19.16b \n" // YYYA
+ "zip2 v17.16b, v18.16b, v19.16b \n"
+ "stp q16, q17, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20");
+}
+#endif // LIBYUV_USE_ST4
+
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
+}
+
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_vu), // %[src_uv]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV21Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
+}
+
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2");
+}
+
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_vu), // %[src_uv]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV21Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2");
+}
+
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n" ARGBTORGB565
+ "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8
+ // pixels
+ // RGB565.
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2");
+}
+
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READYUY2 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
+}
+
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READUYVY YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
+}
+
+// Reads 16 pairs of UV and writes the U bytes to dst_u and the V bytes to dst_v.
+void SplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store U
+ "st1 {v1.16b}, [%2], #16 \n" // store V
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// Reads 16 Y bytes from a tile and writes out 16 Y's.
+// MM21 Y tiles are 16x32, so src_tile_stride = 512 bytes.
+// MM21 UV tiles are 8x16 (UV pairs), so src_tile_stride = 256 bytes.
+// width is measured in bytes, so 8 UV pairs = 16.
+void DetileRow_NEON(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 1792] \n" // 7 tiles of 256b ahead
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 bytes
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "v0" // Clobber List
+ );
+}
+
+// Reads 16 Y values of 16 bits each from a tile and writes out 16 Y's.
+void DetileRow_16_NEON(const uint16_t* src,
+ ptrdiff_t src_tile_stride,
+ uint16_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead
+ "st1 {v0.8h,v1.8h}, [%1], #32 \n" // store 16 pixels
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride * 2) // %3
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
+void DetileSplitUVRow_NEON(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.8b,v1.8b}, [%0], %4 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%0, 1792] \n"
+ "st1 {v0.8b}, [%1], #8 \n"
+ "st1 {v1.8b}, [%2], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(src_tile_stride) // %4
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+#if LIBYUV_USE_ST2
+// Read 16 Y, 8 UV, and write 8 YUY2
+void DetileToYUY2_NEON(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
+ "prfm pldl1keep, [%0, 1792] \n"
+ "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs
+ "prfm pldl1keep, [%1, 1792] \n"
+ "subs %w3, %w3, #16 \n" // store 8 YUY2
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
+ : "cc", "memory", "v0", "v1" // Clobber list
+ );
+}
+#else
+// Read 16 Y, 8 UV, and write 8 YUY2
+void DetileToYUY2_NEON(const uint8_t* src_y,
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
+ "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%0, 1792] \n"
+ "zip1 v2.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 1792] \n"
+ "zip2 v3.16b, v0.16b, v1.16b \n"
+ "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 8 YUY2
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber list
+ );
+}
+#endif
+
+// Unpack MT2T into tiled P010 64 pixels at a time. See
+// tinyurl.com/mtk-10bit-video-format for format documentation.
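+// Each 80-byte block holds 16 bytes of packed 2-bit low bits followed by 64
+// bytes of 8-bit high bits; the zip/shl/sri sequence below rebuilds 64
+// 16-bit samples with the 10-bit value in the most significant bits.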
+void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v7.16b}, [%0], #16 \n"
+ "ld1 {v0.16b-v3.16b}, [%0], #64 \n"
+ "shl v4.16b, v7.16b, #6 \n"
+ "shl v5.16b, v7.16b, #4 \n"
+ "shl v6.16b, v7.16b, #2 \n"
+ "subs %2, %2, #80 \n"
+ "zip1 v16.16b, v4.16b, v0.16b \n"
+ "zip1 v18.16b, v5.16b, v1.16b \n"
+ "zip1 v20.16b, v6.16b, v2.16b \n"
+ "zip1 v22.16b, v7.16b, v3.16b \n"
+ "zip2 v17.16b, v4.16b, v0.16b \n"
+ "zip2 v19.16b, v5.16b, v1.16b \n"
+ "zip2 v21.16b, v6.16b, v2.16b \n"
+ "zip2 v23.16b, v7.16b, v3.16b \n"
+ "sri v16.8h, v16.8h, #10 \n"
+ "sri v17.8h, v17.8h, #10 \n"
+ "sri v18.8h, v18.8h, #10 \n"
+ "sri v19.8h, v19.8h, #10 \n"
+ "st1 {v16.8h-v19.8h}, [%1], #64 \n"
+ "sri v20.8h, v20.8h, #10 \n"
+ "sri v21.8h, v21.8h, #10 \n"
+ "sri v22.8h, v22.8h, #10 \n"
+ "sri v23.8h, v23.8h, #10 \n"
+ "st1 {v20.8h-v23.8h}, [%1], #64 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(size) // %2
+ :
+ : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
+}
+
+#if LIBYUV_USE_ST2
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ asm volatile(
+ "dup v2.8h, %w4 \n"
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // load 8 U
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "ld1 {v1.8h}, [%1], #16 \n" // load 8 V
+ "ushl v0.8h, v0.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ushl v1.8h, v1.8h, v2.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "v0", "v1", "v2");
+}
+#else
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "zip1 v2.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip2 v3.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ asm volatile(
+ "dup v4.8h, %w4 \n"
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // load 8 U
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "ld1 {v1.8h}, [%1], #16 \n" // load 8 V
+ "ushl v0.8h, v0.8h, v4.8h \n"
+ "ushl v1.8h, v1.8h, v4.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v2.8h, v0.8h, v1.8h \n"
+ "zip2 v3.8h, v0.8h, v1.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v1", "v2", "v3", "v4");
+}
+#endif // LIBYUV_USE_ST2
+
+// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store R
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%3], #16 \n" // store B
+ "b.gt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
+void MergeRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load R
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%2], #16 \n" // load B
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_rgb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a.
+void SplitARGBRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w5, %w5, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%3], #16 \n" // store B
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%1], #16 \n" // store R
+ "st1 {v3.16b}, [%4], #16 \n" // store A
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+#if LIBYUV_USE_ST4
+// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
+void MergeARGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%2], #16 \n" // load B
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%0], #16 \n" // load R
+ "ld1 {v3.16b}, [%3], #16 \n" // load A
+ "subs %w5, %w5, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+#else
+// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
+void MergeARGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%2], #16 \n" // load B
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%0], #16 \n" // load R
+ "ld1 {v3.16b}, [%3], #16 \n" // load A
+ "subs %w5, %w5, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%2, 448] \n"
+ "zip1 v4.16b, v0.16b, v1.16b \n" // BG
+ "zip1 v5.16b, v2.16b, v3.16b \n" // RA
+ "prfm pldl1keep, [%1, 448] \n"
+ "zip2 v6.16b, v0.16b, v1.16b \n" // BG
+ "zip2 v7.16b, v2.16b, v3.16b \n" // RA
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v0.8h, v4.8h, v5.8h \n" // BGRA
+ "zip2 v1.8h, v4.8h, v5.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "zip1 v2.8h, v6.8h, v7.8h \n"
+ "zip2 v3.8h, v6.8h, v7.8h \n"
+ "st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
+ );
+}
+#endif // LIBYUV_USE_ST4
+
+// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b.
+void SplitXRGBRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%3], #16 \n" // store B
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%1], #16 \n" // store R
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time
+void MergeXRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.16b, #255 \n" // load A(255)
+ "1: \n"
+ "ld1 {v2.16b}, [%0], #16 \n" // load R
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v0.16b}, [%2], #16 \n" // load B
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void MergeXR30Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width) {
+ int shift = 10 - depth;
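+ // AR30 packs bits [29:20] = R, [19:10] = G, [9:0] = B, with alpha 0b11 in
+ // bits [31:30] (the "orr #0xc0, lsl #24" below).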
+ asm volatile(
+ "movi v30.16b, #255 \n"
+ "ushr v30.4s, v30.4s, #22 \n" // 1023
+ "dup v31.4s, %w5 \n"
+ "1: \n"
+ "ldr d2, [%2], #8 \n" // B
+ "ldr d1, [%1], #8 \n" // G
+ "ldr d0, [%0], #8 \n" // R
+ "ushll v2.4s, v2.4h, #0 \n" // B
+ "ushll v1.4s, v1.4h, #0 \n" // G
+ "ushll v0.4s, v0.4h, #0 \n" // R
+ "ushl v2.4s, v2.4s, v31.4s \n" // 000B
+ "ushl v1.4s, v1.4s, v31.4s \n" // G
+ "ushl v0.4s, v0.4s, v31.4s \n" // R
+ "umin v2.4s, v2.4s, v30.4s \n"
+ "umin v1.4s, v1.4s, v30.4s \n"
+ "umin v0.4s, v0.4s, v30.4s \n"
+ "sli v2.4s, v1.4s, #10 \n" // 00GB
+ "sli v2.4s, v0.4s, #20 \n" // 0RGB
+ "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
+ "subs %w4, %w4, #4 \n"
+ "str q2, [%3], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+ : "r"(shift) // %5
+ : "memory", "cc", "v0", "v1", "v2", "v30", "v31");
+}
+
+void MergeXR30Row_10_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int /* depth */,
+ int width) {
+ asm volatile(
+ "movi v30.16b, #255 \n"
+ "ushr v30.4s, v30.4s, #22 \n" // 1023
+ "1: \n"
+ "ldr d2, [%2], #8 \n" // B
+ "ldr d1, [%1], #8 \n" // G
+ "ldr d0, [%0], #8 \n" // R
+ "ushll v2.4s, v2.4h, #0 \n" // 000B
+ "ushll v1.4s, v1.4h, #0 \n" // G
+ "ushll v0.4s, v0.4h, #0 \n" // R
+ "umin v2.4s, v2.4s, v30.4s \n"
+ "umin v1.4s, v1.4s, v30.4s \n"
+ "umin v0.4s, v0.4s, v30.4s \n"
+ "sli v2.4s, v1.4s, #10 \n" // 00GB
+ "sli v2.4s, v0.4s, #20 \n" // 0RGB
+ "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
+ "subs %w4, %w4, #4 \n"
+ "str q2, [%3], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v30");
+}
+
+void MergeAR64Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int mask = (1 << depth) - 1;
+ asm volatile(
+
+ "dup v30.8h, %w7 \n"
+ "dup v31.8h, %w6 \n"
+ "1: \n"
+ "ldr q2, [%0], #16 \n" // R
+ "ldr q1, [%1], #16 \n" // G
+ "ldr q0, [%2], #16 \n" // B
+ "ldr q3, [%3], #16 \n" // A
+ "umin v2.8h, v2.8h, v30.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umin v1.8h, v1.8h, v30.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "umin v0.8h, v0.8h, v30.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "umin v3.8h, v3.8h, v30.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "ushl v2.8h, v2.8h, v31.8h \n"
+ "ushl v1.8h, v1.8h, v31.8h \n"
+ "ushl v0.8h, v0.8h, v31.8h \n"
+ "ushl v3.8h, v3.8h, v31.8h \n"
+ "subs %w5, %w5, #8 \n"
+ "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n"
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_ar64), // %4
+ "+r"(width) // %5
+ : "r"(shift), // %6
+ "r"(mask) // %7
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
+}
+
+void MergeXR64Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth;
+ int mask = (1 << depth) - 1;
+ asm volatile(
+
+ "movi v3.16b, #0xff \n" // A (0xffff)
+ "dup v30.8h, %w6 \n"
+ "dup v31.8h, %w5 \n"
+
+ "1: \n"
+ "ldr q2, [%0], #16 \n" // R
+ "ldr q1, [%1], #16 \n" // G
+ "ldr q0, [%2], #16 \n" // B
+ "umin v2.8h, v2.8h, v30.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umin v1.8h, v1.8h, v30.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "umin v0.8h, v0.8h, v30.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "ushl v2.8h, v2.8h, v31.8h \n"
+ "ushl v1.8h, v1.8h, v31.8h \n"
+ "ushl v0.8h, v0.8h, v31.8h \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar64), // %3
+ "+r"(width) // %4
+ : "r"(shift), // %5
+ "r"(mask) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
+}
+
+void MergeARGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = 8 - depth;
+ asm volatile(
+
+ "dup v31.8h, %w6 \n"
+ "1: \n"
+ "ldr q2, [%0], #16 \n" // R
+ "ldr q1, [%1], #16 \n" // G
+ "ldr q0, [%2], #16 \n" // B
+ "ldr q3, [%3], #16 \n" // A
+ "ushl v2.8h, v2.8h, v31.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ushl v1.8h, v1.8h, v31.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "ushl v0.8h, v0.8h, v31.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "ushl v3.8h, v3.8h, v31.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v3.8b, v3.8h \n"
+ "subs %w5, %w5, #8 \n"
+ "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : "r"(shift) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
+}
+
+void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = 8 - depth;
+ asm volatile(
+
+ "dup v31.8h, %w5 \n"
+ "movi v3.8b, #0xff \n" // A (0xff)
+ "1: \n"
+ "ldr q2, [%0], #16 \n" // R
+ "ldr q1, [%1], #16 \n" // G
+ "ldr q0, [%2], #16 \n" // B
+ "ushl v2.8h, v2.8h, v31.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ushl v1.8h, v1.8h, v31.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "ushl v0.8h, v0.8h, v31.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(shift) // %5
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
+}
+
+// Copy a multiple of 32 bytes.
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ "stp q0, q1, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// SetRow writes 'width' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+ asm volatile(
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "v0");
+}
+
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+ asm volatile(
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "v0");
+}
+
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v3.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q2, [%0, 16] \n"
+ "ldr q1, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #32 \n" // 32 pixels per loop.
+ "tbl v0.16b, {v2.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirror) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorUV) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w3, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "uzp1 v0.16b, v2.16b, v3.16b \n" // U
+ "uzp2 v1.16b, v2.16b, v3.16b \n" // V
+ "st1 {v0.16b}, [%1], #16 \n" // dst += 16
+ "st1 {v1.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(&kShuffleMirrorUV) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Shuffle table for reversing the ARGB.
+static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
+ 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
+
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #2 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "ld1 {v3.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #48 \n"
+
+ "1: \n"
+ "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v0.16b, {v0.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "tbl v2.16b, {v2.16b}, v3.16b \n"
+ "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-48), // %3
+ "r"(&kShuffleMirror) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v4.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
+ // RGB24.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v5.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "movi v0.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v2.8b, v4.8b, v4.8b \n" // move g
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v1.8b, v5.8b, v5.8b \n" // move r
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ asm volatile(
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+#define RGB565TOARGB \
+ "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
+ "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
+ "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
+ "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
+ "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
+ "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
+ "dup v2.2D, v0.D[1] \n" /* R */
+
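+// Scalar sketch of the RGB565TOARGB expansion above (illustrative only,
+// hypothetical helper): each field is widened by shifting into the high
+// bits and replicating its top bits into the vacated low bits, which is
+// what the shl/ushr/orr pairs implement per lane.
+static inline uint32_t RGB565ToARGBPixel_Sketch(uint16_t rgb565) {
+  uint32_t b = rgb565 & 0x1f;
+  uint32_t g = (rgb565 >> 5) & 0x3f;
+  uint32_t r = (rgb565 >> 11) & 0x1f;
+  b = (b << 3) | (b >> 2);  // BBBBB000 | 00000BBB
+  g = (g << 2) | (g >> 4);  // GGGGGG00 | 000000GG
+  r = (r << 3) | (r >> 2);  // RRRRR000 | 00000RRR
+  return 0xff000000u | (r << 16) | (g << 8) | b;  // ARGB word, A = 255
+}
+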
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
+ );
+}
+
+#define ARGB1555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
+ \
+ "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
+ "xtn2 v3.16b, v2.8h \n" \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
+ "dup v1.2D, v0.D[1] \n" \
+ "dup v3.2D, v2.D[1] \n"
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
+ "dup v1.2D, v0.D[1] \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b
+// clobbers v3
+#define ARGB4444TOARGB \
+ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
+ "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
+ "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
+ "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
+ "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
+ "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
+ "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
+ "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
+ "dup v0.2D, v2.D[1] \n" \
+ "dup v1.2D, v3.D[1] \n"
+
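+// Scalar sketch of the nibble expansion in ARGB4444TOARGB (illustrative,
+// hypothetical helper): replicating a 4-bit field into 8 bits is
+// (x << 4) | x, i.e. x * 17, so 0xF maps to 0xFF and 0x8 to 0x88.
+static inline uint8_t Expand4To8_Sketch(uint8_t nibble) {
+  nibble &= 0x0f;
+  return (uint8_t)((nibble << 4) | nibble);
+}
+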
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48 \n" // store 8 RGB24
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
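+// YUY2 is packed as Y0 U0 Y1 V0 ...; the ld2 above de-interleaves even
+// bytes (all luma) into v0 and odd bytes (interleaved chroma) into v1, so
+// extracting Y is a single structured load plus a store.
+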
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "prfm pldl1keep, [%0, 448] \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(src_yuy2b), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
+ );
+}
+
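+// urhadd is an unsigned rounding halving add, (a + b + 1) >> 1; it averages
+// the chroma of two consecutive rows to implement the vertical 2:1
+// subsample in the UV rows above and below.
+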
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "prfm pldl1keep, [%0, 448] \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(src_uyvyb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
+ );
+}
+
+void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "ld2 {v2.16b,v3.16b}, [%1], #32 \n" // load next row
+ "urhadd v4.16b, v1.16b, v3.16b \n" // average rows of UV
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v4.16b}, [%2], #16 \n" // store 8 UV.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(src_yuy2b), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "orr v2.8b, v1.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v3.8b, v2.8b, v2.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb565,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565
+ "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v16", "v17", "v18", "v19");
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
+ int width) {
+ asm volatile(
+ "dup v1.4s, %w3 \n" // dither4
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uqadd v16.8b, v16.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqadd v17.8b, v17.8b, v1.8b \n"
+ "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565
+ "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb), // %1
+ "+r"(width) // %2
+ : "r"(dither4) // %3
+ : "cc", "memory", "v1", "v16", "v17", "v18", "v19");
+}
+
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb1555,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v16", "v17", "v18", "v19");
+}
+
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb4444,
+ int width) {
+ asm volatile(
+ "movi v23.16b, #0x0f \n" // bits to clear with
+ // vbic.
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23");
+}
+
+#if LIBYUV_USE_ST2
+void ARGBToAR64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ldp q0, q2, [%0], #32 \n" // load 8 pixels
+ "mov v1.16b, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mov v3.16b, v2.16b \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
+ "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ar64), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
+ 10, 9, 8, 11, 14, 13, 12, 15};
+
+void ARGBToAB64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
+ asm volatile(
+ "ldr q4, [%3] \n" // shuffler
+ "1: \n"
+ "ldp q0, q2, [%0], #32 \n" // load 8 pixels
+ "tbl v0.16b, {v0.16b}, v4.16b \n"
+ "tbl v2.16b, {v2.16b}, v4.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mov v1.16b, v0.16b \n"
+ "mov v3.16b, v2.16b \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
+ "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleARGBToABGR) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+#else
+void ARGBToAR64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "zip1 v2.16b, v0.16b, v0.16b \n"
+ "zip2 v3.16b, v0.16b, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v4.16b, v1.16b, v1.16b \n"
+ "zip2 v5.16b, v1.16b, v1.16b \n"
+ "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ar64), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
+}
+
+static const uvec8 kShuffleARGBToAB64[2] = {
+ {2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7},
+ {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}};
+
+void ARGBToAB64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
+ asm volatile(
+ "ldp q6, q7, [%3] \n" // 2 shufflers
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64
+ "tbl v3.16b, {v0.16b}, v7.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v4.16b, {v1.16b}, v6.16b \n"
+ "tbl v5.16b, {v1.16b}, v7.16b \n"
+ "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleARGBToAB64[0]) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+#endif // LIBYUV_USE_ST2
+
+static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15,
+ 17, 19, 21, 23, 25, 27, 29, 31};
+
+void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "ldr q4, [%3] \n" // shuffler
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 4 pixels
+ "ldp q2, q3, [%0], #32 \n" // load 4 pixels
+ "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "stp q0, q2, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_ar64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleAR64ToARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
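+// The kShuffleAR64ToARGB table above selects byte 1 of every little-endian
+// 16-bit channel, so the conversion is simply v >> 8 per channel. Scalar
+// sketch (illustrative, hypothetical helper):
+static inline uint8_t AR64ChannelTo8_Sketch(uint16_t v) {
+  return (uint8_t)(v >> 8);  // keep the most significant byte
+}
+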
+static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15,
+ 21, 19, 17, 23, 29, 27, 25, 31};
+
+void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "ldr q4, [%3] \n" // shuffler
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 4 pixels
+ "ldp q2, q3, [%0], #32 \n" // load 4 pixels
+ "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "stp q0, q2, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_ab64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleAB64ToARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+struct RgbUVConstants {
+ uint8_t kRGBToU[4];
+ uint8_t kRGBToV[4];
+};
+
+// 8x1 pixels.
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width,
+ const struct RgbUVConstants* rgbuvconstants) {
+ asm volatile(
+ "ldr d0, [%4] \n" // load rgbuvconstants
+ "dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient
+ "dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient
+ "dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient
+ "dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient
+ "dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient
+ "movi v29.16b, #0x80 \n" // 128.5
+
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "prfm pldl1keep, [%0, 448] \n"
+
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+
+ "addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned
+ "addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned
+
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(rgbuvconstants) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
+ "v27", "v28", "v29");
+}
+
+// RGB to BT.601 coefficients
+// UB 0.875 coefficient = 112
+// UG -0.5781 coefficient = 74
+// UR -0.2969 coefficient = 38
+// VB -0.1406 coefficient = 18
+// VG -0.7344 coefficient = 94
+// VR 0.875 coefficient = 112 (ignored)
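+// Each magnitude is scaled by 128, e.g. 112 = 0.875 * 128 and 94 = 0.7344 * 128.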
+
+static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
+ {18, 94, 112, 0}};
+
+// RGB to JPEG coefficients
+// UB coeff 0.500 = 127
+// UG coeff -0.33126 = 84
+// UR coeff -0.16874 = 43
+// VB coeff -0.08131 = 20
+// VG coeff -0.41869 = 107
+// VR coeff 0.500 = 127 (ignored)
+
+static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
+ {20, 107, 127, 0}};
+
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+ &kRgb24I601UVConstants);
+}
+
+void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+ &kRgb24JPegUVConstants);
+}
+
+#define RGBTOUV_SETUP_REG \
+ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
+ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
+ "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
+ "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
+ "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
+ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// clang-format off
+#define RGBTOUV(QB, QG, QR) \
+ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
+ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
+ "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
+ "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
+ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
+ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
+ "addhn v0.8b, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "addhn v1.8b, v4.8h, v25.8h \n" /* +128 -> unsigned */
+// clang-format on
+
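+// Scalar sketch of one U/V pair from RGBTOUV (illustrative, hypothetical
+// helper). In the functions below, b16/g16/r16 are 2x2 sums halved once by
+// urshr #1, i.e. 2x the average channel value; the v20-v25 constants are
+// the full coefficients divided by 2, and addhn adds 0x8080 then keeps the
+// high byte (>> 8).
+static inline void RgbToUV_Sketch(uint16_t b16, uint16_t g16, uint16_t r16,
+                                  uint8_t* u, uint8_t* v) {
+  *u = (uint8_t)((b16 * 56 - g16 * 37 - r16 * 19 + 0x8080) >> 8);
+  *v = (uint8_t)((r16 * 56 - g16 * 47 - b16 * 9 + 0x8080) >> 8);
+}
+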
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_argb_1 = src_argb + src_stride_argb;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+// TODO(fbarchard): Subsample match Intel code.
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_argb_1 = src_argb + src_stride_argb;
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_uj,
+ uint8_t* dst_vj,
+ int width) {
+ const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v2.8h, v1.8h, v0.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_abgr_1), // %1
+ "+r"(dst_uj), // %2
+ "+r"(dst_vj), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_rgb24_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RAWToUVJRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_raw_1 = src_raw + src_stride_raw;
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v2.8h, v1.8h, v0.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_raw_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_bgra_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v2.8h, v1.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_abgr_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_rgba_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_rgb24_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_raw_1 = src_raw + src_stride_raw;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v2.8h, v1.8h, v0.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_raw_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
+ asm volatile(
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_rgb565_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28");
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
+ asm volatile(
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_argb1555_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28");
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
+ asm volatile(
+ RGBTOUV_SETUP_REG // sets v20-v25
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_argb4444_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28"
+
+ );
+}
+
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
+ "v27");
+}
+
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
+}
+
+struct RgbConstants {
+ uint8_t kRGBToY[4];
+ uint16_t kAddY;
+};
+
+// ARGB expects the first 3 values to contain RGB; the 4th value is ignored.
+void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "ldr d0, [%3] \n" // load rgbconstants
+ "dup v6.16b, v0.b[0] \n"
+ "dup v7.16b, v0.b[1] \n"
+ "dup v16.16b, v0.b[2] \n"
+ "dup v17.8h, v0.h[2] \n"
+ "1: \n"
+ "ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16
+ // pixels.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "umull v0.8h, v2.8b, v6.8b \n" // B
+ "umull2 v1.8h, v2.16b, v6.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v0.8h, v3.8b, v7.8b \n" // G
+ "umlal2 v1.8h, v3.16b, v7.16b \n"
+ "umlal v0.8h, v4.8b, v16.8b \n" // R
+ "umlal2 v1.8h, v4.16b, v16.16b \n"
+ "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
+ "addhn v1.8b, v1.8h, v17.8h \n"
+ "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17");
+}
+
+// RGB to JPEG coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+ 0x1080};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
+
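+// Scalar sketch of the Y computation shared by the *ToYMatrixRow functions
+// (illustrative, hypothetical helper; channel order shown for ARGB-style
+// constants with the B coefficient first). addhn adds kAddY and keeps the
+// high byte, so the +0.5 (or +16.5) bias and the >> 8 happen together.
+static inline uint8_t RgbToY_Sketch(uint8_t b, uint8_t g, uint8_t r,
+                                    const struct RgbConstants* c) {
+  return (uint8_t)((b * c->kRGBToY[0] + g * c->kRGBToY[1] +
+                    r * c->kRGBToY[2] + c->kAddY) >> 8);
+}
+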
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
+}
+
+void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
+}
+
+// RGBA expects the first value to be A (ignored), followed by 3 values
+// containing RGB. Same code as ARGB, except for the ld4 register order.
+void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "ldr d0, [%3] \n" // load rgbconstants
+ "dup v6.16b, v0.b[0] \n"
+ "dup v7.16b, v0.b[1] \n"
+ "dup v16.16b, v0.b[2] \n"
+ "dup v17.8h, v0.h[2] \n"
+ "1: \n"
+ "ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16
+ // pixels.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "umull v0.8h, v2.8b, v6.8b \n" // B
+ "umull2 v1.8h, v2.16b, v6.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v0.8h, v3.8b, v7.8b \n" // G
+ "umlal2 v1.8h, v3.16b, v7.16b \n"
+ "umlal v0.8h, v4.8b, v16.8b \n" // R
+ "umlal2 v1.8h, v4.16b, v16.16b \n"
+ "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
+ "addhn v1.8b, v1.8h, v17.8h \n"
+ "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17");
+}
+
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
+}
+
+void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
+ RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
+}
+
+void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "ldr d0, [%3] \n" // load rgbconstants
+ "dup v5.16b, v0.b[0] \n"
+ "dup v6.16b, v0.b[1] \n"
+ "dup v7.16b, v0.b[2] \n"
+ "dup v16.8h, v0.h[2] \n"
+ "1: \n"
+ "ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "umull v0.8h, v2.8b, v5.8b \n" // B
+ "umull2 v1.8h, v2.16b, v5.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v0.8h, v3.8b, v6.8b \n" // G
+ "umlal2 v1.8h, v3.16b, v6.16b \n"
+ "umlal v0.8h, v4.8b, v7.8b \n" // R
+ "umlal2 v1.8h, v4.16b, v7.16b \n"
+ "addhn v0.8b, v0.8h, v16.8h \n" // 16 bit to 8 bit Y
+ "addhn v1.8b, v1.8h, v16.8h \n"
+ "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
+}
+
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
+}
+
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ asm volatile(
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction), // %4
+ "+r"(y0_fraction) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
+}
+
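+// Scalar sketch of the general blend path above (illustrative, hypothetical
+// helper): with f = source_y_fraction, each output byte is
+// (s0 * (256 - f) + s1 * f + 128) >> 8; f == 0 copies the row unchanged and
+// f == 128 reduces to a rounding average (urhadd).
+static inline uint8_t BlendPixel_Sketch(uint8_t s0, uint8_t s1, int f) {
+  return (uint8_t)((s0 * (256 - f) + s1 * f + 128) >> 8);
+}
+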
+// Bilinear filter 8x2 -> 8x1
+void InterpolateRow_16_NEON(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+
+ asm volatile(
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+
+ "dup v5.8h, %w4 \n"
+ "dup v4.8h, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.8h}, [%1], #16 \n"
+ "ld1 {v1.8h}, [%2], #16 \n"
+ "subs %w3, %w3, #8 \n"
+ "umull v2.4s, v0.4h, v4.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull2 v3.4s, v0.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal v2.4s, v1.4h, v5.4h \n"
+ "umlal2 v3.4s, v1.8h, v5.8h \n"
+ "rshrn v0.4h, v2.4s, #8 \n"
+ "rshrn2 v0.8h, v3.4s, #8 \n"
+ "st1 {v0.8h}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.8h}, [%1], #16 \n"
+ "ld1 {v1.8h}, [%2], #16 \n"
+ "subs %w3, %w3, #8 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "urhadd v0.8h, v0.8h, v1.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.8h}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ld1 {v0.8h}, [%1], #16 \n"
+ "subs %w3, %w3, #8 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v0.8h}, [%0], #16 \n"
+ "b.gt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width) // %3
+ : "r"(y1_fraction), // %4
+ "r"(y0_fraction) // %5
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
+}
+
+// Bilinear filter 8x2 -> 8x1
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
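+  // Worked example (illustrative): scale = 4096 for 12-bit sources gives
+  // __builtin_clz(4096) = 19, so shift = 15 - 19 = -4; ushl by a negative
+  // amount shifts right, moving 12-bit values into 8 bits (4095 >> 4 = 255).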
+
+ asm volatile(
+ "dup v6.8h, %w6 \n"
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+
+ "dup v5.8h, %w4 \n"
+ "dup v4.8h, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.8h}, [%1], #16 \n"
+ "ld1 {v1.8h}, [%2], #16 \n"
+ "subs %w3, %w3, #8 \n"
+ "umull v2.4s, v0.4h, v4.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull2 v3.4s, v0.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal v2.4s, v1.4h, v5.4h \n"
+ "umlal2 v3.4s, v1.8h, v5.8h \n"
+ "rshrn v0.4h, v2.4s, #8 \n"
+ "rshrn2 v0.8h, v3.4s, #8 \n"
+ "ushl v0.8h, v0.8h, v6.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%0], #8 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.8h}, [%1], #16 \n"
+ "ld1 {v1.8h}, [%2], #16 \n"
+ "subs %w3, %w3, #8 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "urhadd v0.8h, v0.8h, v1.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "ushl v0.8h, v0.8h, v6.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%0], #8 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ldr q0, [%1], #16 \n"
+ "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "str d0, [%0], #8 \n" // store 8 pixels
+ "b.gt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width) // %3
+ : "r"(y1_fraction), // %4
+ "r"(y0_fraction), // %5
+ "r"(shift) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
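+// e.g. with dr = 100 and sa = 128: dr * sa / 256 = 50, so the result is
+// sr + (100 - 50) = sr + 50, matching dr * (256 - 128) / 256 + sr. The
+// rewritten form needs only a multiply, shift and subtract per channel, and
+// uqsub/uqadd keep the intermediate values from wrapping.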
+void ARGBBlendRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "subs %w3, %w3, #8 \n"
+ "b.lt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "prfm pldl1keep, [%0, 448] \n"
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.ge 8b \n"
+
+ "89: \n"
+ "adds %w3, %w3, #8-1 \n"
+ "b.lt 99f \n"
+
+ // Blend 1 pixel.
+ "1: \n"
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel
+ // ARGB0.
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel
+ // ARGB1.
+ "subs %w3, %w3, #1 \n" // 1 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "prfm pldl1keep, [%0, 448] \n"
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18");
+}
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v7.8h, #0x00ff \n" // 255 for rounding up
+
+ // Attenuate 8 pixels.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "prfm pldl1keep, [%0, 448] \n"
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "addhn v0.8b, v4.8h, v7.8h \n" // (b + 255) >> 8
+ "addhn v1.8b, v5.8h, v7.8h \n" // (g + 255) >> 8
+ "addhn v2.8b, v6.8h, v7.8h \n" // (r + 255) >> 8
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset
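+// sqdmulh returns (2 * a * b) >> 16, so scale is pre-halved (ushr #1 below)
+// to obtain the plain (dst * scale) >> 16. Illustrative values: with
+// scale = 65536 / 8 = 8192, interval_size = 8, interval_offset = 4, a value
+// of 200 maps to ((200 * 8192) >> 16) * 8 + 4 = 25 * 8 + 4 = 204.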
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ asm volatile(
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
+
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
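+// The setup below expands the packed 32-bit value into four u16 lanes of
+// the form component * 0x0101 (zip1 with itself duplicates each byte), then
+// halves them so that sqrdmulh's doubling yields roughly x * component / 255
+// per channel.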
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ asm volatile(
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
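+// The coefficients sum to 256, so the result stays in range: for white
+// input (b = g = r = 255), (256 * 255 + 128) >> 8 = 255.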
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v24.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v25.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v26.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
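+// The outputs are narrowed with uqshrn, which saturates: e.g. for white
+// input (255, 255, 255), g = (45 + 88 + 22) * 255 >> 7 = 308 clamps to 255,
+// while b = (35 + 68 + 17) * 255 >> 7 = 239 is left as is.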
+
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // BB coefficient
+ "movi v29.8b, #98 \n" // BG coefficient
+ "movi v30.8b, #50 \n" // BR coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
+}
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v22", "v23", "v24", "v25");
+}
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ // 16 pixel loop.
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1");
+}
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
+ "prfm pldl1keep, [%1, 448] \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
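+// Applied as column differences: for each of the three rows the code forms
+// row(x) - row(x + 2) with usubl, adds the middle row's difference twice,
+// and takes the absolute value, so the kernel's sign convention cancels out.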
+void SobelXRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8b}, [%0],%5 \n" // top
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
+ "ld1 {v3.8b}, [%2],%6 \n"
+ "subs %w4, %w4, #8 \n" // 8 pixels
+ "prfm pldl1keep, [%2, 448] \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "b.gt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2LL), // %5
+ "r"(6LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
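+// Same decomposition as SobelX with rows and columns swapped: each of the
+// three columns contributes src_y0(x) - src_y1(x) between the top and
+// bottom rows, the center column is added twice, then the absolute value.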
+void SobelYRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8b}, [%0],%4 \n" // left
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%0],%5 \n" // right
+ "ld1 {v3.8b}, [%1],%5 \n"
+ "subs %w3, %w3, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "b.gt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1LL), // %4
+ "r"(6LL) // %5
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// Caveat: rounds float to half float, whereas the scaling version truncates.
+void HalfFloat1Row_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float /*unused*/,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 half floats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
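+// The scale operand below is pre-multiplied by 1.9259299444e-34f, which is
+// 2^-112, the difference between the float32 exponent bias (127) and the
+// float16 bias (15). After that multiply, bits [28:13] of the float32 value
+// are, in effect, the float16 encoding, so uqshrn #13 both extracts it and
+// saturates values that are out of range.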
+void HalfFloatRow_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
+ "uqshrn2 v1.8h, v3.4s, #13 \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale * 1.9259299444e-34f) // %3
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+ float* dst,
+ float scale,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v1.8h, v1.8b \n" // 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v2.4s, v1.4h \n" // 8 ints
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+// Convert FP16 Half Floats to FP32 Floats
+void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16
+ float* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats
+ "subs %w2, %w2, #8 \n" // 8 floats per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "fcvtl v2.4s, v1.4h \n" // 8 floats
+ "fcvtl2 v3.4s, v1.8h \n"
+ "stp q2, q3, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+// Convert FP16 Half Floats to FP32 Floats
+// Read a column and write a row
+void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16
+ int src_stride, // stride in elements
+ float* dst,
+ int width) {
+ asm volatile(
+ "cmp %w2, #8 \n" // Is there 8 rows?
+ "b.lo 2f \n"
+ "1: \n"
+ "ld1 {v0.h}[0], [%0], %3 \n" // load 8 halffloats
+ "ld1 {v0.h}[1], [%0], %3 \n"
+ "ld1 {v0.h}[2], [%0], %3 \n"
+ "ld1 {v0.h}[3], [%0], %3 \n"
+ "ld1 {v1.h}[0], [%0], %3 \n"
+ "ld1 {v1.h}[1], [%0], %3 \n"
+ "ld1 {v1.h}[2], [%0], %3 \n"
+ "ld1 {v1.h}[3], [%0], %3 \n"
+ "subs %w2, %w2, #8 \n" // 8 rows per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "fcvtl v2.4s, v0.4h \n" // 4 floats
+ "fcvtl v3.4s, v1.4h \n" // 4 more floats
+ "stp q2, q3, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
+ "cmp %w2, #1 \n" // Is there 1 value?
+ "b.lo 3f \n"
+ "2: \n"
+ "ld1 {v1.h}[0], [%0], %3 \n" // load 1 halffloats
+ "subs %w2, %w2, #1 \n" // 1 floats per loop
+ "fcvtl v2.4s, v1.4h \n" // 1 floats
+ "str s2, [%1], #4 \n" // store 1 floats
+ "b.gt 2b \n"
+ "3: \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)(src_stride * 2)) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Convert FP32 Floats to FP16 Half Floats
+void ConvertFP32ToFP16Row_NEON(const float* src,
+ uint16_t* dst, // fp16
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ldp q2, q3, [%0], #32 \n" // load 8 floats
+ "subs %w2, %w2, #8 \n" // 8 floats per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 fp16 halffloats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "str q1, [%1], #16 \n" // store 8 fp16 halffloats
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+float ScaleMaxSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width) {
+ float fmax;
+ asm volatile(
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n"
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
+ "fmax v5.4s, v5.4s, v1.4s \n" // max
+ "fmax v6.4s, v6.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "fmax v5.4s, v5.4s, v6.4s \n" // max
+ "fmaxv %s3, v5.4s \n" // signed max acculator
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width), // %2
+ "=w"(fmax) // %3
+ : "w"(scale) // %4
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+ return fmax;
+}
+
+float ScaleSumSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width) {
+ float fsum;
+ asm volatile(
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n" // max
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v4.4s, v2.4s, %4.s[0] \n"
+ "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
+ "fmla v6.4s, v2.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "faddp v5.4s, v5.4s, v6.4s \n"
+ "faddp v5.4s, v5.4s, v5.4s \n"
+ "faddp %3.4s, v5.4s, v5.4s \n" // sum
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width), // %2
+ "=w"(fsum) // %3
+ : "w"(scale) // %4
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+ return fsum;
+}
+
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "v1", "v2");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
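+// The coefficients sum to 16; the full 2-D Gaussian is this column pass
+// followed by the matching row pass (GaussRow_NEON), a combined weight of
+// 256, and the single divide by 256 happens in the row pass (uqrshrn #8).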
+void GaussCol_NEON(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width) {
+ asm volatile(
+ "movi v6.8h, #4 \n" // constant 4
+ "movi v7.8h, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
+ "ld1 {v2.8h}, [%4], #16 \n"
+ "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
+ "ld1 {v2.8h}, [%1], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%1, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "ld1 {v2.8h}, [%2], #16 \n"
+ "umlal v0.4s, v2.4h, v7.4h \n" // * 6
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
+ "ld1 {v2.8h}, [%3], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%3, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%4, 448] \n"
+ "b.gt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+ const uint32_t* src1 = src + 1;
+ const uint32_t* src2 = src + 2;
+ const uint32_t* src3 = src + 3;
+ asm volatile(
+ "movi v6.4s, #4 \n" // constant 4
+ "movi v7.4s, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
+ "add v0.4s, v0.4s, v1.4s \n" // * 1
+ "add v1.4s, v1.4s, v2.4s \n" // * 1
+ "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
+ "mla v0.4s, v2.4s, v7.4s \n" // * 6
+ "mla v1.4s, v3.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
+ "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
+ "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
+ "add v3.4s, v3.4s, v5.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mla v0.4s, v2.4s, v6.4s \n" // * 4
+ "mla v1.4s, v3.4s, v6.4s \n" // * 4
+ "subs %w5, %w5, #8 \n" // 8 processed per loop
+ "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
+ "uqrshrn2 v0.8h, v1.4s, #8 \n"
+ "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(width) // %5
+ : "r"(32LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {
+ asm volatile(
+ "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
+ "ld1 {v2.4s, v3.4s}, [%1], #32 \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%2], #32 \n"
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%3], #32 \n"
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%4], #32 \n"
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "fadd v0.4s, v0.4s, v4.4s \n" // * 1
+ "prfm pldl1keep, [%3, 448] \n"
+ "fadd v1.4s, v1.4s, v5.4s \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ : "r"(&kGaussCoefficients) // %7
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_F32_NEON(const float* src, float* dst, int width) {
+ asm volatile(
+ "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5
+ // rows
+ "fadd v0.4s, v0.4s, v1.4s \n" // * 1
+ "ld1 {v4.4s, v5.4s}, [%0], %5 \n"
+ "fadd v1.4s, v1.4s, v2.4s \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%0], %4 \n"
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "ld1 {v4.4s, v5.4s}, [%0], %6 \n"
+ "fadd v2.4s, v2.4s, v4.4s \n"
+ "fadd v3.4s, v3.4s, v5.4s \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v0.4s, v0.4s, v8.4s \n" // / 256
+ "fmul v1.4s, v1.4s, v8.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kGaussCoefficients), // %3
+ "r"(8LL), // %4
+ "r"(-4LL), // %5
+ "r"(20LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+}
+
+#if LIBYUV_USE_ST3
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
+ "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
+ "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2");
+}
+#else
+static const uvec8 kYUV24Shuffle[3] = {
+ {16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20},
+ {21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27},
+ {10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15}};
+
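+// Each tbl below indexes the 2-register table {v0 = Y bytes 0-15, v1 = VU
+// bytes 16-31}: e.g. the first three indices {16, 17, 0} assemble pixel 0
+// as V0, U0, Y0, and {16, 17, 1} reuses the same chroma pair for pixel 1.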
+// Convert biplanar NV21 to packed YUV24
+// NV21 has VU in memory for chroma.
+// YUV24 is VUY in memory
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values
+ "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values
+ "tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ : "r"(&kYUV24Shuffle[0]) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+#endif // LIBYUV_USE_ST3
+
+// Note: the ST2 8b version is faster than zip + ST1.
+
+// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
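+// The 2x2 chroma average below works as follows: uaddlp sums horizontally
+// adjacent bytes, uadalp accumulates the pair sums from the second row, and
+// uqrshrn #2 computes (sum + 2) >> 2, a rounded average of the four samples.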
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+ asm volatile(
+
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v2.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_ayuv_1), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+ asm volatile(
+
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_ayuv_1), // %1
+ "+r"(dst_vu), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Shuffle table for swapping UV bytes.
+static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
+ "1: \n"
+ "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
+ "ld1 {v1.16b}, [%0], 16 \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "tbl v0.16b, {v0.16b}, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v1.16b, {v1.16b}, v2.16b \n"
+ "stp q0, q1, [%1], 32 \n" // store 16 VU pixels
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleSwapUV) // %3
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u;
+ const uint8_t* src_v_1 = src_v + src_stride_v;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values
+ "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
+ "ld1 {v2.16b}, [%1], #16 \n"
+ "ld1 {v3.16b}, [%3], #16 \n"
+ "uaddlp v0.8h, v0.16b \n" // half size
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v3.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uqrshrn v0.8b, v0.8h, #2 \n"
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w5, %w5, #16 \n" // 16 src pixels per loop
+ "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void SplitUVRow_16_NEON(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {
+ int shift = depth - 16; // Negative for right shift.
+ asm volatile(
+ "dup v2.8h, %w4 \n"
+ "1: \n"
+ "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "ushl v0.8h, v0.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ushl v1.8h, v1.8h, v2.8h \n"
+ "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels
+ "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+void MultiplyRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ asm volatile(
+ "dup v2.8h, %w3 \n"
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "mul v0.8h, v0.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mul v1.8h, v1.8h, v2.8h \n"
+ "stp q0, q1, [%1], #32 \n" // store 16 pixels
+ "subs %w2, %w2, #16 \n" // 16 src pixels per loop
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+void DivideRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ asm volatile(
+ "dup v4.8h, %w3 \n"
+ "1: \n"
+ "ldp q2, q3, [%0], #32 \n"
+ "umull v0.4s, v2.4h, v4.4h \n"
+ "umull2 v1.4s, v2.8h, v4.8h \n"
+ "umull v2.4s, v3.4h, v4.4h \n"
+ "umull2 v3.4s, v3.8h, v4.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "shrn v0.4h, v0.4s, #16 \n"
+ "shrn2 v0.8h, v1.4s, #16 \n"
+ "shrn v1.4h, v2.4s, #16 \n"
+ "shrn2 v1.8h, v3.4s, #16 \n"
+ "stp q0, q1, [%1], #32 \n" // store 16 pixels
+ "subs %w2, %w2, #16 \n" // 16 src pixels per loop
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits = shr 1
+// 16384 = 10 bits = shr 2
+// 4096 = 12 bits = shr 4
+// 256 = 16 bits = shr 8
+void Convert16To8Row_NEON(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
+ asm volatile(
+ "dup v2.8h, %w3 \n"
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative
+ "ushl v1.8h, v1.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn2 v0.16b, v1.8h \n"
+ "subs %w2, %w2, #16 \n" // 16 src pixels per loop
+ "str q0, [%1], #16 \n" // store 16 pixels
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(shift) // %3
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
new file mode 100644
index 00000000..0bf2bef6
--- /dev/null
+++ b/source/row_rvv.cc
@@ -0,0 +1,1394 @@
+/*
+ * Copyright 2023 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * Contributed by Darren Hsieh <darren.hsieh@sifive.com>
+ * Contributed by Bruce Lai <bruce.lai@sifive.com>
+ */
+
+#include "libyuv/row.h"
+
+// This module is for clang RVV. GCC does not yet support segment load & store.
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
+ defined(__clang__)
+#include <assert.h>
+#include <riscv_vector.h>
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Fill YUV -> RGB conversion constants into vectors
+// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+// register) is set to round-to-nearest-up mode(0).
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
+ { \
+ asm volatile("csrwi vxrm, 0"); \
+ ub = yuvconst->kUVCoeff[0]; \
+ vr = yuvconst->kUVCoeff[1]; \
+ ug = yuvconst->kUVCoeff[2]; \
+ vg = yuvconst->kUVCoeff[3]; \
+ yg = yuvconst->kRGBCoeffBias[0]; \
+ bb = yuvconst->kRGBCoeffBias[1] + 32; \
+ bg = yuvconst->kRGBCoeffBias[2] - 32; \
+ br = yuvconst->kRGBCoeffBias[3] + 32; \
+ }
+
+// Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422
+#define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
+ { \
+ vuint8m1_t v_tmp0, v_tmp1; \
+ vuint8m2_t v_y; \
+ vuint16m2_t v_u_16, v_v_16; \
+ vl = __riscv_vsetvl_e8m1((w + 1) / 2); \
+ v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \
+ v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \
+ v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \
+ v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \
+ v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \
+ v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \
+ v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \
+ v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \
+ vl = __riscv_vsetvl_e8m2(w); \
+ v_y = __riscv_vle8_v_u8m2(src_y, vl); \
+ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
+ }
+
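+// In READYUV422 the widened chroma is multiplied by 0x0101, which copies
+// each byte into both halves of its u16 lane (e.g. 0x0080 * 0x0101 =
+// 0x8080); reinterpreted as bytes this duplicates every U and V value,
+// giving the 2x horizontal chroma upsampling 422 needs without a shuffle.
+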
+// Read [2*VLEN/8] Y, [2*VLEN/8] U, and [2*VLEN/8] V from 444
+#define READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
+ { \
+ vuint8m2_t v_y; \
+ vl = __riscv_vsetvl_e8m2(w); \
+ v_y = __riscv_vle8_v_u8m2(src_y, vl); \
+ v_u = __riscv_vle8_v_u8m2(src_u, vl); \
+ v_v = __riscv_vle8_v_u8m2(src_v, vl); \
+ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
+ }
+
+// Convert from YUV to fixed point RGB
+#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \
+ v_b_16, v_r_16) \
+ { \
+ vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \
+ vuint32m8_t v_tmp5; \
+ v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl); \
+ v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); \
+ v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl); \
+ v_tmp1 = __riscv_vwmulu_vx_u16m4(v_u, ub, vl); \
+ v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl); \
+ v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl); \
+ v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl); \
+ v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl); \
+ v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl); \
+ v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl); \
+ v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl); \
+ v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl); \
+ }
+
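+// Luma handling in YUVTORGB: y is first multiplied by 0x0101 (i.e. y * 257)
+// to stretch 0..255 to the full 0..65535 range; the widening multiply by yg
+// and the narrowing shift by 16 then yield (y * 257 * yg) >> 16, the
+// fixed-point luma term shared by all three channels.
+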
+// Convert from fixed point RGB to 8 bit RGB
+#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \
+ { \
+ v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl); \
+ v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl); \
+ v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \
+ }
+
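+// The RGB values leaving YUVTORGB carry 6 fractional bits, so RGBTORGB8
+// narrows them with vnclipu by 6, rounding per the vxrm mode set in
+// YUVTORGB_SETUP and saturating to the 0..255 range.
+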
+// Read [2*VLEN/8] Y from src_y; read [VLEN/8] U and [VLEN/8] V from src_uv
+#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \
+ { \
+ vuint8m1_t v_tmp0, v_tmp1; \
+ vuint8m2_t v_y; \
+ vuint16m2_t v_u_16, v_v_16; \
+ vl = __riscv_vsetvl_e8m1((w + 1) / 2); \
+ __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_uv, vl); \
+ v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \
+ v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \
+ v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \
+ v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \
+ v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \
+ v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \
+ vl = __riscv_vsetvl_e8m2(w); \
+ v_y = __riscv_vle8_v_u8m2(src_y, vl); \
+ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
+ }
+
+// Read [2*VLEN/8] Y from src_y; read [VLEN/8] U and [VLEN/8] V from src_vu
+#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16) \
+ { \
+ vuint8m1_t v_tmp0, v_tmp1; \
+ vuint8m2_t v_y; \
+ vuint16m2_t v_u_16, v_v_16; \
+ vl = __riscv_vsetvl_e8m1((w + 1) / 2); \
+ __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_vu, vl); \
+ v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \
+ v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \
+ v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \
+ v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \
+ v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \
+ v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \
+ vl = __riscv_vsetvl_e8m2(w); \
+ v_y = __riscv_vle8_v_u8m2(src_y, vl); \
+ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
+ }
+
+#ifdef HAS_ARGBTOAR64ROW_RVV
+void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
+ size_t avl = (size_t)4 * width;
+ do {
+ vuint16m8_t v_ar64;
+ vuint8m4_t v_argb;
+ size_t vl = __riscv_vsetvl_e8m4(avl);
+ v_argb = __riscv_vle8_v_u8m4(src_argb, vl);
+ v_ar64 = __riscv_vwaddu_vx_u16m8(v_argb, 0, vl);
+ v_ar64 = __riscv_vmul_vx_u16m8(v_ar64, 0x0101, vl);
+ __riscv_vse16_v_u16m8(dst_ar64, v_ar64, vl);
+ avl -= vl;
+ src_argb += vl;
+ dst_ar64 += vl;
+ } while (avl > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_RVV
+void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
+ size_t avl = (size_t)width;
+ do {
+ vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16;
+ vuint8m1_t v_b, v_g, v_r, v_a;
+ size_t vl = __riscv_vsetvl_e8m1(avl);
+ __riscv_vlseg4e8_v_u8m1(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl);
+ v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl);
+ v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl);
+ v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl);
+ v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl);
+ v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl);
+ v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl);
+ v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl);
+ __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r_16, v_g_16, v_b_16, v_a_16, vl);
+ avl -= vl;
+ src_argb += 4 * vl;
+ dst_ab64 += 4 * vl;
+ } while (avl > 0);
+}
+#endif
+
+#ifdef HAS_AR64TOARGBROW_RVV
+void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
+ size_t avl = (size_t)4 * width;
+ do {
+ vuint16m8_t v_ar64;
+ vuint8m4_t v_argb;
+ size_t vl = __riscv_vsetvl_e16m8(avl);
+ v_ar64 = __riscv_vle16_v_u16m8(src_ar64, vl);
+ v_argb = __riscv_vnsrl_wx_u8m4(v_ar64, 8, vl);
+ __riscv_vse8_v_u8m4(dst_argb, v_argb, vl);
+ avl -= vl;
+ src_ar64 += vl;
+ dst_argb += vl;
+ } while (avl > 0);
+}
+#endif
+
+#ifdef HAS_AR64TOAB64ROW_RVV
+void AR64ToAB64Row_RVV(const uint16_t* src_ar64,
+ uint16_t* dst_ab64,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e16m2(w);
+ vuint16m2_t v_b, v_g, v_r, v_a;
+ __riscv_vlseg4e16_v_u16m2(&v_b, &v_g, &v_r, &v_a, src_ar64, vl);
+ __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r, v_g, v_b, v_a, vl);
+ w -= vl;
+ src_ar64 += vl * 4;
+ dst_ab64 += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_AB64TOARGBROW_RVV
+void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
+ size_t avl = (size_t)width;
+ do {
+ vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16;
+ vuint8m1_t v_b, v_g, v_r, v_a;
+ size_t vl = __riscv_vsetvl_e16m2(avl);
+ __riscv_vlseg4e16_v_u16m2(&v_r_16, &v_g_16, &v_b_16, &v_a_16, src_ab64, vl);
+ v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl);
+ v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl);
+ v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl);
+ v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl);
+ __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
+ avl -= vl;
+ src_ab64 += 4 * vl;
+ dst_argb += 4 * vl;
+ } while (avl > 0);
+}
+#endif
+
+#ifdef HAS_RAWTOARGBROW_RVV
+void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ vuint8m2_t v_b, v_g, v_r;
+ __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_raw += vl * 3;
+ dst_argb += vl * 4;
+ vl = __riscv_vsetvl_e8m2(w);
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_RAWTORGBAROW_RVV
+void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ vuint8m2_t v_b, v_g, v_r;
+ __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_raw += vl * 3;
+ dst_rgba += vl * 4;
+ vl = __riscv_vsetvl_e8m2(w);
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_RAWTORGB24ROW_RVV
+void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl);
+ w -= vl;
+ src_raw += vl * 3;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTORAWROW_RVV
+void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_raw += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTORGB24ROW_RVV
+void ARGBToRGB24Row_RVV(const uint8_t* src_argb,
+ uint8_t* dst_rgb24,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTOABGRROW_RVV
+void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a, v_r, v_g, v_b;
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_abgr, v_r, v_g, v_b, v_a, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_abgr += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTOBGRAROW_RVV
+void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a, v_r, v_g, v_b;
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_bgra, v_a, v_r, v_g, v_b, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_bgra += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTORGBAROW_RVV
+void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a, v_r, v_g, v_b;
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_rgba += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_RGBATOARGBROW_RVV
+void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a, v_r, v_g, v_b;
+ __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_rgba += vl * 4;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_RGB24TOARGBROW_RVV
+void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ vuint8m2_t v_b, v_g, v_r;
+ __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_rgb24 += vl * 3;
+ dst_argb += vl * 4;
+ vl = __riscv_vsetvl_e8m2(w);
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I444TOARGBROW_RVV
+void I444ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl;
+ src_v += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I444ALPHATOARGBROW_RVV
+void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ do {
+ READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ v_a = __riscv_vle8_v_u8m2(src_a, vl);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_a += vl;
+ src_u += vl;
+ src_v += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I444TORGB24ROW_RVV
+void I444ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ do {
+ READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl;
+ src_v += vl;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I422TOARGBROW_RVV
+void I422ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl / 2;
+ src_v += vl / 2;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I422ALPHATOARGBROW_RVV
+void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ do {
+ READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ v_a = __riscv_vle8_v_u8m2(src_a, vl);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_a += vl;
+ src_u += vl / 2;
+ src_v += vl / 2;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I422TORGBAROW_RVV
+void I422ToRGBARow_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl / 2;
+ src_v += vl / 2;
+ dst_rgba += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I422TORGB24ROW_RVV
+void I422ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t vl;
+ size_t w = (size_t)width;
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ do {
+ READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_u += vl / 2;
+ src_v += vl / 2;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_I400TOARGBROW_RVV
+void I400ToARGBRow_RVV(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0);
+ vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ vuint16m4_t v_yb;
+ vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
+  // To match behavior on other platforms, vxrm (the fixed-point rounding
+  // mode register) is set to round-to-nearest-up mode (0).
+ asm volatile("csrwi vxrm, 0");
+ if (is_yb_positive) {
+ v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
+ } else {
+ v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl);
+ }
+ do {
+ vuint8m2_t v_y, v_out;
+ vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2;
+ vl = __riscv_vsetvl_e8m2(w);
+ v_y = __riscv_vle8_v_u8m2(src_y, vl);
+ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);
+ v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); // 257 * v_y
+ v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl);
+ if (is_yb_positive) {
+ v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl);
+ } else {
+ v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl);
+ }
+ v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_out, v_out, v_out, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_J400TOARGBROW_RVV
+void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ vuint8m2_t v_y;
+ v_y = __riscv_vle8_v_u8m2(src_y, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_y, v_y, v_y, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ dst_argb += vl * 4;
+ vl = __riscv_vsetvl_e8m2(w);
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_COPYROW_RVV
+void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(w);
+ vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl);
+ __riscv_vse8_v_u8m8(dst, v_data, vl);
+ w -= vl;
+ src += vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
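+// CopyRow_RVV is a vector memcpy for one row. It uses LMUL=8 (e8m8): a
+// plain copy keeps only one register group live, so the widest grouping
+// moves the most bytes per vsetvl stripe. Behaviorally, CopyRow_RVV(src,
+// dst, width) matches memcpy(dst, src, (size_t)width) for non-overlapping
+// rows.
+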
+#ifdef HAS_NV12TOARGBROW_RVV
+void NV12ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_uv += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_NV12TORGB24ROW_RVV
+void NV12ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ do {
+ READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_uv += vl;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_NV21TOARGBROW_RVV
+void NV21ToARGBRow_RVV(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_y += vl;
+ src_vu += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_NV21TORGB24ROW_RVV
+void NV21ToRGB24Row_RVV(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ do {
+ READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
+ RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
+ w -= vl;
+ src_y += vl;
+ src_vu += vl;
+ dst_rgb24 += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1
+
+#ifdef HAS_INTERPOLATEROW_RVV
+void InterpolateRow_RVV(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ size_t dst_w = (size_t)dst_width;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+ // Blend 100 / 0 - Copy row unchanged.
+ if (y1_fraction == 0) {
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(dst_w);
+ __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl);
+ dst_w -= vl;
+ src_ptr += vl;
+ dst_ptr += vl;
+ } while (dst_w > 0);
+ return;
+ }
+  // To match behavior on other platforms, vxrm (the fixed-point rounding
+  // mode register) is set to round-to-nearest-up mode (0).
+ asm volatile("csrwi vxrm, 0");
+ // Blend 50 / 50.
+ if (y1_fraction == 128) {
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(dst_w);
+ vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl);
+ vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl);
+ // Use round-to-nearest-up mode for averaging add
+ vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl);
+ __riscv_vse8_v_u8m8(dst_ptr, row_out, vl);
+ dst_w -= vl;
+ src_ptr += vl;
+ src_ptr1 += vl;
+ dst_ptr += vl;
+ } while (dst_w > 0);
+ return;
+ }
+ // General purpose row blend.
+ do {
+ size_t vl = __riscv_vsetvl_e8m4(dst_w);
+ vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl);
+ vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl);
+ vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl);
+ acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl);
+ // Use round-to-nearest-up mode for vnclip
+ __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl);
+ dst_w -= vl;
+ src_ptr += vl;
+ src_ptr1 += vl;
+ dst_ptr += vl;
+ } while (dst_w > 0);
+}
+#endif
+
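+// Scalar reference for the general blend above (illustrative sketch only).
+// With vxrm set to round-to-nearest-up, vnclipu(acc, 8) adds 128 before the
+// shift, e.g. p0 = 0, p1 = 255, y1_fraction = 64:
+// (0 * 192 + 255 * 64 + 128) >> 8 = 64.
+#if 0
+static uint8_t InterpolatePixel_Sketch(uint8_t p0, uint8_t p1,
+                                       int y1_fraction) {
+  int y0_fraction = 256 - y1_fraction;
+  return (uint8_t)((p0 * y0_fraction + p1 * y1_fraction + 128) >> 8);
+}
+#endif
+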
+#ifdef HAS_SPLITRGBROW_RVV
+void SplitRGBRow_RVV(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl);
+ __riscv_vse8_v_u8m2(dst_r, v_r, vl);
+ __riscv_vse8_v_u8m2(dst_g, v_g, vl);
+ __riscv_vse8_v_u8m2(dst_b, v_b, vl);
+ w -= vl;
+ dst_r += vl;
+ dst_g += vl;
+ dst_b += vl;
+ src_rgb += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_MERGERGBROW_RVV
+void MergeRGBRow_RVV(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
+ vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
+ vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl);
+ w -= vl;
+ src_r += vl;
+ src_g += vl;
+ src_b += vl;
+ dst_rgb += vl * 3;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SPLITARGBROW_RVV
+void SplitARGBRow_RVV(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vse8_v_u8m2(dst_a, v_a, vl);
+ __riscv_vse8_v_u8m2(dst_r, v_r, vl);
+ __riscv_vse8_v_u8m2(dst_g, v_g, vl);
+ __riscv_vse8_v_u8m2(dst_b, v_b, vl);
+ w -= vl;
+ dst_a += vl;
+ dst_r += vl;
+ dst_g += vl;
+ dst_b += vl;
+ src_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_MERGEARGBROW_RVV
+void MergeARGBRow_RVV(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
+ vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
+ vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
+ vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_r += vl;
+ src_g += vl;
+ src_b += vl;
+ src_a += vl;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SPLITXRGBROW_RVV
+void SplitXRGBRow_RVV(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vse8_v_u8m2(dst_r, v_r, vl);
+ __riscv_vse8_v_u8m2(dst_g, v_g, vl);
+ __riscv_vse8_v_u8m2(dst_b, v_b, vl);
+ w -= vl;
+ dst_r += vl;
+ dst_g += vl;
+ dst_b += vl;
+ src_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_MERGEXRGBROW_RVV
+void MergeXRGBRow_RVV(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
+ do {
+ vuint8m2_t v_r, v_g, v_b;
+ v_r = __riscv_vle8_v_u8m2(src_r, vl);
+ v_g = __riscv_vle8_v_u8m2(src_g, vl);
+ v_b = __riscv_vle8_v_u8m2(src_b, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_r += vl;
+ src_g += vl;
+ src_b += vl;
+ dst_argb += vl * 4;
+ vl = __riscv_vsetvl_e8m2(w);
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SPLITUVROW_RVV
+void SplitUVRow_RVV(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ vuint8m4_t v_u, v_v;
+ __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl);
+ __riscv_vse8_v_u8m4(dst_u, v_u, vl);
+ __riscv_vse8_v_u8m4(dst_v, v_v, vl);
+ w -= vl;
+ dst_u += vl;
+ dst_v += vl;
+ src_uv += 2 * vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_MERGEUVROW_RVV
+void MergeUVRow_RVV(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m4_t v_u, v_v;
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ v_u = __riscv_vle8_v_u8m4(src_u, vl);
+ v_v = __riscv_vle8_v_u8m4(src_v, vl);
+ __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl);
+ w -= vl;
+ src_u += vl;
+ src_v += vl;
+ dst_uv += 2 * vl;
+ } while (w > 0);
+}
+#endif
+
+struct RgbConstants {
+ uint8_t kRGBToY[4];
+ uint16_t kAddY;
+ uint16_t pad;
+};
+
+// RGB to JPEG coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
+ 128,
+ 0};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+ 0x1080,
+ 0};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
+ 0x1080,
+ 0};
+
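+// Worked example for kRgb24I601Constants: white (B = G = R = 255) gives
+// (25 + 129 + 66) * 255 + 0x1080 = 56100 + 4224 = 60324, and
+// 60324 >> 8 = 235, the BT.601 studio-swing white level. Scalar shape of
+// the Y kernels below (illustrative sketch only):
+#if 0
+static uint8_t RGBToY_Sketch(uint8_t b, uint8_t g, uint8_t r,
+                             const struct RgbConstants* c) {
+  return (uint8_t)((c->kRGBToY[0] * b + c->kRGBToY[1] * g +
+                    c->kRGBToY[2] * r + c->kAddY) >> 8);
+}
+#endif
+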
+// ARGB expects the first 3 values to contain RGB; the 4th (alpha) is ignored.
+#ifdef HAS_ARGBTOYMATRIXROW_RVV
+void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ assert(width != 0);
+ size_t w = (size_t)width;
+  vuint8m2_t v_by, v_gy, v_ry;  // vectors holding the RGBToY coefficients
+  vuint16m4_t v_addy;           // vector holding kAddY
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
+ v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
+ v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
+ v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a, v_y;
+ vuint16m4_t v_y_u16;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
+ v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
+ v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
+ v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
+ v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
+ __riscv_vse8_v_u8m2(dst_y, v_y, vl);
+ w -= vl;
+ src_argb += 4 * vl;
+ dst_y += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBTOYROW_RVV
+void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+#endif
+
+#ifdef HAS_ARGBTOYJROW_RVV
+void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+}
+#endif
+
+#ifdef HAS_ABGRTOYROW_RVV
+void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants);
+}
+#endif
+
+#ifdef HAS_ABGRTOYJROW_RVV
+void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_RVV(src_abgr, dst_yj, width, &kRawJPEGConstants);
+}
+#endif
+
+// RGBA expects the first value to be A (ignored), followed by 3 RGB values.
+#ifdef HAS_RGBATOYMATRIXROW_RVV
+void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ assert(width != 0);
+ size_t w = (size_t)width;
+  vuint8m2_t v_by, v_gy, v_ry;  // vectors holding the RGBToY coefficients
+  vuint16m4_t v_addy;           // vector holding kAddY
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
+ v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
+ v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
+ v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a, v_y;
+ vuint16m4_t v_y_u16;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl);
+ v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
+ v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
+ v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
+ v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
+ v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
+ __riscv_vse8_v_u8m2(dst_y, v_y, vl);
+ w -= vl;
+ src_rgba += 4 * vl;
+ dst_y += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_RGBATOYROW_RVV
+void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants);
+}
+#endif
+
+#ifdef HAS_RGBATOYJROW_RVV
+void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
+ RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+}
+#endif
+
+#ifdef HAS_BGRATOYROW_RVV
+void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants);
+}
+#endif
+
+#ifdef HAS_RGBTOYMATRIXROW_RVV
+void RGBToYMatrixRow_RVV(const uint8_t* src_rgb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ assert(width != 0);
+ size_t w = (size_t)width;
+  vuint8m2_t v_by, v_gy, v_ry;  // vectors holding the RGBToY coefficients
+  vuint16m4_t v_addy;           // vector holding kAddY
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
+ v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
+ v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
+ v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_y;
+ vuint16m4_t v_y_u16;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb, vl);
+ v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
+ v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
+ v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
+ v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
+ v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
+ __riscv_vse8_v_u8m2(dst_y, v_y, vl);
+ w -= vl;
+ src_rgb += 3 * vl;
+ dst_y += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_RGB24TOYJROW_RVV
+void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+}
+#endif
+
+#ifdef HAS_RAWTOYJROW_RVV
+void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants);
+}
+#endif
+
+#ifdef HAS_RGB24TOYROW_RVV
+void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants);
+}
+#endif
+
+#ifdef HAS_RAWTOYROW_RVV
+void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants);
+}
+#endif
+
+// Blend src_argb over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb or src_argb1.
+// src_argb: RGB values have already been premultiplied by alpha.
+#ifdef HAS_ARGBBLENDROW_RVV
+void ARGBBlendRow_RVV(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ size_t w = (size_t)width;
+ size_t vl = __riscv_vsetvlmax_e8m2();
+ // clamp255((((256 - a) * b) >> 8) + f)
+ // = b * (256 - a) / 256 + f
+ // = b - (b * a / 256) + f
+ vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl);
+ do {
+ vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a;
+ vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a;
+ vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r;
+ vuint8m2_t v_dst_b, v_dst_g, v_dst_r;
+ vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a,
+ src_argb, vl);
+ __riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a,
+ src_argb1, vl);
+
+ v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl);
+ v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl);
+ v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl);
+
+ v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl);
+ v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl);
+ v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl);
+
+ v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl);
+ v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl);
+ v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl);
+
+ w -= vl;
+ src_argb += 4 * vl;
+ src_argb1 += 4 * vl;
+ dst_argb += 4 * vl;
+ } while (w > 0);
+}
+#endif
+
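+// Scalar reference for the premultiplied blend above (illustrative sketch
+// only): vmulhu computes (b * a) >> 8 and vsaddu saturates the final add.
+#if 0
+static uint8_t BlendChannel_Sketch(uint8_t f, uint8_t a, uint8_t b) {
+  int v = (b - ((b * a) >> 8)) + f;  // == b * (256 - a) / 256 + f
+  return (uint8_t)(v > 255 ? 255 : v);
+}
+#endif
+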
+#ifdef HAS_BLENDPLANEROW_RVV
+void BlendPlaneRow_RVV(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint16m8_t v_dst_u16;
+ vuint8m4_t v_dst;
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
+ vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
+ vuint8m4_t v_alpha = __riscv_vle8_v_u8m4(alpha, vl);
+ vuint8m4_t v_255_minus_alpha = __riscv_vrsub_vx_u8m4(v_alpha, 255u, vl);
+
+    // dst = (alpha * src0 + (255 - alpha) * src1 + 255) >> 8
+ v_dst_u16 = __riscv_vwmulu_vv_u16m8(v_alpha, v_src0, vl);
+ v_dst_u16 =
+ __riscv_vwmaccu_vv_u16m8(v_dst_u16, v_255_minus_alpha, v_src1, vl);
+ v_dst_u16 = __riscv_vadd_vx_u16m8(v_dst_u16, 255u, vl);
+ v_dst = __riscv_vnsrl_wx_u8m4(v_dst_u16, 8, vl);
+
+ __riscv_vse8_v_u8m4(dst, v_dst, vl);
+ w -= vl;
+ src0 += vl;
+ src1 += vl;
+ alpha += vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
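+// The +255 bias above makes the endpoints exact, e.g. alpha = 255,
+// src0 = 200: (255 * 200 + 0 + 255) >> 8 = 51255 >> 8 = 200; alpha = 0
+// returns src1 the same way.
+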
+// Attenuate: (f * a + 255) >> 8
+#ifdef HAS_ARGBATTENUATEROW_RVV
+void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_ba_16, v_ga_16, v_ra_16;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ // f * a
+ v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl);
+ v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl);
+ v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl);
+ // f * a + 255
+ v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl);
+ v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl);
+ v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl);
+ // (f * a + 255) >> 8
+ v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl);
+ v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl);
+ v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
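+// Worked example for the attenuation above: a = 255 is the identity, since
+// (f * 255 + 255) >> 8 = (255 * (f + 1)) >> 8 = f for any f in [0, 255];
+// a = 0 drives the channel to (0 + 255) >> 8 = 0.
+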
+#ifdef HAS_ARGBEXTRACTALPHAROW_RVV
+void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
+ __riscv_vse8_v_u8m2(dst_a, v_a, vl);
+ w -= vl;
+ src_argb += vl * 4;
+ dst_a += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_RVV
+void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
+ size_t w = (size_t)width;
+ const ptrdiff_t dst_stride = 4;
+ dst += 3;
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(w);
+ vuint8m8_t v_a = __riscv_vle8_v_u8m8(src, vl);
+ __riscv_vsse8_v_u8m8(dst, dst_stride, v_a, vl);
+ w -= vl;
+ src += vl;
+ dst += vl * dst_stride;
+ } while (w > 0);
+}
+#endif
+
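+// ARGBCopyYToAlphaRow_RVV above writes only the A bytes of each ARGB pixel
+// via a strided store (vsse8 with a 4-byte stride, dst pre-advanced by 3).
+// Equivalent scalar loop (illustrative sketch only):
+#if 0
+  for (int i = 0; i < width; ++i) {
+    dst[i * 4 + 3] = src[i];  // overwrite the alpha channel only
+  }
+#endif
+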
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) &&
+ // defined(__clang__)
diff --git a/files/source/row_win.cc b/source/row_win.cc
index 27e3da7b..5fb28521 100644
--- a/files/source/row_win.cc
+++ b/source/row_win.cc
@@ -10,11 +10,13 @@
#include "libyuv/row.h"
-// This module is for Visual C 32/64 bit and clangcl 32 bit
+// This module is for Visual C 32/64 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
- (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+ !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
-#if defined(_M_X64)
+#if defined(_M_ARM64EC)
+#include <intrin.h>
+#elif defined(_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h> // For _mm_maddubs_epi16
#endif
@@ -27,12 +29,34 @@ extern "C" {
// 64 bit
#if defined(_M_X64)
+// Read 8 UV from 444
+#define READYUV444 \
+ xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \
+ xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ u_buf += 8; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8;
+
+// Read 8 UV from 444. With 8 Alpha.
+#define READYUVA444 \
+ xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \
+ xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ u_buf += 8; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8; \
+ xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
+ a_buf += 8;
+
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
u_buf += 4; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -40,10 +64,10 @@ extern "C" {
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
u_buf += 4; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -52,24 +76,21 @@ extern "C" {
a_buf += 8;
// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(yuvconstants) \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm2 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
- xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
- xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
- xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
- xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
- xmm0 = _mm_adds_epi16(xmm0, xmm4); \
- xmm1 = _mm_adds_epi16(xmm1, xmm4); \
- xmm2 = _mm_adds_epi16(xmm2, xmm4); \
- xmm0 = _mm_srai_epi16(xmm0, 6); \
- xmm1 = _mm_srai_epi16(xmm1, 6); \
- xmm2 = _mm_srai_epi16(xmm2, 6); \
- xmm0 = _mm_packus_epi16(xmm0, xmm0); \
- xmm1 = _mm_packus_epi16(xmm1, xmm1); \
+#define YUVTORGB(yuvconstants) \
+ xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8((char)0x80)); \
+ xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
+ xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \
+ xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \
+ xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \
+ xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \
+ xmm0 = _mm_adds_epi16(xmm4, xmm0); \
+ xmm1 = _mm_subs_epi16(xmm4, xmm1); \
+ xmm2 = _mm_adds_epi16(xmm4, xmm2); \
+ xmm0 = _mm_srai_epi16(xmm0, 6); \
+ xmm1 = _mm_srai_epi16(xmm1, 6); \
+ xmm2 = _mm_srai_epi16(xmm2, 6); \
+ xmm0 = _mm_packus_epi16(xmm0, xmm0); \
+ xmm1 = _mm_packus_epi16(xmm1, xmm1); \
xmm2 = _mm_packus_epi16(xmm2, xmm2);
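
+// Note on the rewritten YUVTORGB above: UV is re-biased to signed once
+// (subtract 0x80), so each channel needs a single signed multiply-add
+// (_mm_maddubs_epi16) against its packed coefficients; G is subtracted
+// from the biased Y term because its U and V coefficients are negative.
+// Rough scalar shape of one channel (illustrative only, not the exact
+// constants):
+//   y16 = ((y * 0x0101) * yg >> 16) + ybias;  // 6-bit fixed point
+//   b = clamp((y16 + ub * (u - 128)) >> 6);   // r analogous
+//   g = clamp((y16 - ug * (u - 128) - vg * (v - 128)) >> 6);
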
// Store 8 ARGB values.
@@ -90,7 +111,7 @@ void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
const __m128i xmm5 = _mm_set1_epi8(-1);
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
@@ -110,7 +131,7 @@ void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
READYUVA422
@@ -121,6 +142,44 @@ void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
}
#endif
+#if defined(HAS_I444TOARGBROW_SSSE3)
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ const __m128i xmm5 = _mm_set1_epi8(-1);
+ const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
+ while (width > 0) {
+ READYUV444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ width -= 8;
+ }
+}
+#endif
+
+#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
+ const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
+ while (width > 0) {
+ READYUVA444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ width -= 8;
+ }
+}
+#endif
+
// 32 bit
#else // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3
@@ -187,11 +246,11 @@ static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
-static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+// 8 bit fixed point 0.5, for the UV bias.
+static const ulvec8 kBiasUV128 = {
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
@@ -836,7 +895,7 @@ __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
__asm {
@@ -883,7 +942,7 @@ __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -1367,7 +1426,7 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
}
}
-__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1380,7 +1439,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@@ -1439,7 +1498,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1452,7 +1511,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUVJ128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToVJ
movdqa xmm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
@@ -1513,7 +1572,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
}
#ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1526,7 +1585,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
+ vbroadcastf128 ymm5, xmmword ptr kBiasUV128
vbroadcastf128 ymm6, xmmword ptr kARGBToV
vbroadcastf128 ymm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@@ -1581,7 +1640,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1594,9 +1653,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
- vbroadcastf128 ymm6, xmmword ptr kARGBToV
- vbroadcastf128 ymm7, xmmword ptr kARGBToU
+ vbroadcastf128 ymm5, xmmword ptr kBiasUV128
+ vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
+ vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
convertloop:
@@ -1649,7 +1708,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
}
#endif // HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
@@ -1659,7 +1718,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@@ -1707,7 +1766,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1720,7 +1779,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kBGRAToV
movdqa xmm7, xmmword ptr kBGRAToU
sub edi, edx // stride from u to v
@@ -1779,7 +1838,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1792,7 +1851,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kABGRToV
movdqa xmm7, xmmword ptr kABGRToU
sub edi, edx // stride from u to v
@@ -1851,7 +1910,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1864,7 +1923,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kRGBAToV
movdqa xmm7, xmmword ptr kRGBAToU
sub edi, edx // stride from u to v
@@ -1926,137 +1985,153 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
// Read 16 UV from 444
#define READYUV444_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* U */ \
- __asm vmovdqu xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* U */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpermq ymm3, ymm3, 0xd8 \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
+// Read 16 UV from 444. With 16 Alpha.
+#define READYUVA444_AVX2 \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* U */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpermq ymm1, ymm1, 0xd8 \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16] \
+ __asm vmovdqu xmm5, [ebp] /* A */ \
+ __asm vpermq ymm5, ymm5, 0xd8 \
+ __asm lea ebp, [ebp + 16]}
+
// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
- __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm { \
+ __asm vmovq xmm3, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
#define READYUVA422_AVX2 \
- __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm { \
+ __asm vmovq xmm3, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16] \
- __asm vmovdqu xmm5, [ebp] /* A */ \
+ __asm vmovdqu xmm5, [ebp] /* A */ \
__asm vpermq ymm5, ymm5, 0xd8 \
__asm lea ebp, [ebp + 16]}
// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* UV */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
// Read 8 UV from NV21, upsample to 16 UV.
#define READNV21_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* UV */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* YUY2 */ \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* YUY2 */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 32]}
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* UYVY */ \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* UYVY */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 32]}
// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) \
- __asm { \
- __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
- __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
- __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
- __asm vpsubw ymm2, ymm3, ymm2 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
- __asm vpsubw ymm1, ymm3, ymm1 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
- __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm { \
+ __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \
__asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
- __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
- __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
- __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
+ __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \
+ __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \
+ __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \
+ __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \
+ __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \
+  __asm vpmaddubsw ymm2, ymm2, ymm3 /* R UV */ \
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \
+ __asm vpaddw ymm4, ymm3, ymm4 \
+ __asm vpaddsw ymm0, ymm0, ymm4 \
+ __asm vpsubsw ymm1, ymm4, ymm1 \
+ __asm vpaddsw ymm2, ymm2, ymm4 \
__asm vpsraw ymm0, ymm0, 6 \
__asm vpsraw ymm1, ymm1, 6 \
__asm vpsraw ymm2, ymm2, 6 \
- __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
- __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
- __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
- }
+ __asm vpackuswb ymm0, ymm0, ymm0 \
+ __asm vpackuswb ymm1, ymm1, ymm1 \
+ __asm vpackuswb ymm2, ymm2, ymm2}
// Store 16 ARGB values.
#define STOREARGB_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+ __asm { \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
__asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
__asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
- __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
__asm vmovdqu 0[edx], ymm1 \
__asm vmovdqu 32[edx], ymm0 \
__asm lea edx, [edx + 64]}
// Store 16 RGBA values.
#define STORERGBA_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
+ __asm { \
+ __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
+ __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
__asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
- __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
+ __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
+ __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
__asm vmovdqu [edx], ymm0 \
__asm vmovdqu [edx + 32], ymm1 \
__asm lea edx, [edx + 64]}
@@ -2183,6 +2258,48 @@ __declspec(naked) void I444ToARGBRow_AVX2(
}
#endif // HAS_I444TOARGBROW_AVX2
+#ifdef HAS_I444ALPHATOARGBROW_AVX2
+// 16 pixels.
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) void I444AlphaToARGBRow_AVX2(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov ebp, [esp + 16 + 16] // A
+ mov edx, [esp + 16 + 20] // argb
+ mov ebx, [esp + 16 + 24] // yuvconstants
+ mov ecx, [esp + 16 + 28] // width
+ sub edi, esi
+ convertloop:
+ READYUVA444_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I444ALPHATOARGBROW_AVX2
+
#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
@@ -2361,191 +2478,202 @@ __declspec(naked) void I422ToRGBARow_AVX2(
// Read 8 UV from 444.
#define READYUV444 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* U */ \
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
+// Read 8 UV from 444. With 8 Alpha.
+#define READYUVA444 \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* U */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8] \
+ __asm movq xmm5, qword ptr [ebp] /* A */ \
+ __asm lea ebp, [ebp + 8]}
+
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
- __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm movd xmm3, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
- __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm movd xmm3, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] /* Y */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] /* Y */ \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8] \
- __asm movq xmm5, qword ptr [ebp] /* A */ \
+ __asm movq xmm5, qword ptr [ebp] /* A */ \
__asm lea ebp, [ebp + 8]}
// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
// Read 4 VU from NV21, upsample to 8 UV.
#define READNV21 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
- __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
+ __asm pshufb xmm3, xmmword ptr kShuffleNV21 \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
#define READYUY2 \
- __asm { \
- __asm movdqu xmm4, [eax] /* YUY2 */ \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* YUY2 */ \
__asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
+ __asm movdqu xmm3, [eax] /* UV */ \
+ __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 16]}
// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
#define READUYVY \
- __asm { \
- __asm movdqu xmm4, [eax] /* UYVY */ \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* UYVY */ \
__asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
+ __asm movdqu xmm3, [eax] /* UV */ \
+ __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 16]}
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) \
- __asm { \
- __asm movdqa xmm1, xmm0 \
- __asm movdqa xmm2, xmm0 \
- __asm movdqa xmm3, xmm0 \
- __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
- __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
- __asm psubw xmm0, xmm1 \
- __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
- __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
- __asm psubw xmm1, xmm2 \
- __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
- __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
- __asm psubw xmm2, xmm3 \
+ __asm { \
+ __asm psubb xmm3, xmmword ptr kBiasUV128 \
__asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
- __asm paddsw xmm0, xmm4 /* B += Y */ \
- __asm paddsw xmm1, xmm4 /* G += Y */ \
- __asm paddsw xmm2, xmm4 /* R += Y */ \
+ __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \
+ __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \
+ __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \
+ __asm pmaddubsw xmm0, xmm3 \
+ __asm pmaddubsw xmm1, xmm3 \
+ __asm pmaddubsw xmm2, xmm3 \
+ __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \
+ __asm paddw xmm4, xmm3 \
+ __asm paddsw xmm0, xmm4 \
+ __asm paddsw xmm2, xmm4 \
+ __asm psubsw xmm4, xmm1 \
+ __asm movdqa xmm1, xmm4 \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \
}
// Store 8 ARGB values.
#define STOREARGB \
- __asm { \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm5 /* RA */ \
+ __asm { \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm5 /* RA */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
__asm movdqu 0[edx], xmm0 \
__asm movdqu 16[edx], xmm1 \
__asm lea edx, [edx + 32]}
// Store 8 BGRA values.
#define STOREBGRA \
- __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm0 /* GB */ \
- __asm punpcklbw xmm5, xmm2 /* AR */ \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm0 /* GB */ \
+ __asm punpcklbw xmm5, xmm2 /* AR */ \
__asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
+ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
__asm movdqu 0[edx], xmm5 \
__asm movdqu 16[edx], xmm0 \
__asm lea edx, [edx + 32]}
// Store 8 RGBA values.
#define STORERGBA \
- __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm2 /* GR */ \
- __asm punpcklbw xmm5, xmm0 /* AB */ \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm2 /* GR */ \
+ __asm punpcklbw xmm5, xmm0 /* AB */ \
__asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
+ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
__asm movdqu 0[edx], xmm5 \
__asm movdqu 16[edx], xmm0 \
__asm lea edx, [edx + 32]}
// Store 8 RGB24 values.
#define STORERGB24 \
- __asm {/* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
- __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
- __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
- __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
- __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
- __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
__asm lea edx, [edx + 24]}
// Store 8 RGB565 values.
#define STORERGB565 \
- __asm {/* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
- __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
- __asm movdqa xmm2, xmm0 /* G */ \
- __asm pslld xmm0, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm0, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm0, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm0, xmm3 /* BGR */ \
- __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
- __asm movdqa xmm2, xmm1 /* G */ \
- __asm pslld xmm1, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm1, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm1, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm1, xmm3 /* BGR */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
+ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm0 /* G */ \
+ __asm pslld xmm0, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm0, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm0, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm0, xmm3 /* BGR */ \
+ __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm1 /* G */ \
+ __asm pslld xmm1, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm1, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm1, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm1, xmm3 /* BGR */ \
__asm packssdw xmm0, xmm1 \
- __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
+ __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
__asm lea edx, [edx + 16]}
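The shift/mask sequence above is the SIMD form of the standard 888-to-565 pack; per pixel it is equivalent to:

static unsigned short PackRGB565(unsigned char b, unsigned char g, unsigned char r) {
  // 5 bits blue (low), 6 bits green, 5 bits red (high).
  return (unsigned short)((b >> 3) | ((unsigned short)(g >> 2) << 5) |
                          ((unsigned short)(r >> 3) << 11));
}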
// 8 pixels.
@@ -2586,6 +2714,46 @@ __declspec(naked) void I444ToARGBRow_SSSE3(
}
// 8 pixels.
+// 8 UV values, mixed with 8 Y and 8A producing 8 ARGB (32 bytes).
+__declspec(naked) void I444AlphaToARGBRow_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov ebp, [esp + 16 + 16] // A
+ mov edx, [esp + 16 + 20] // argb
+ mov ebx, [esp + 16 + 24] // yuvconstants
+ mov ecx, [esp + 16 + 28] // width
+ sub edi, esi
+
+ convertloop:
+ READYUVA444
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
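A note on the [esp + 16 + 4] style operands in these naked cdecl functions: the 16 accounts for the four saved registers, the 4 skips the return address, so the N-th argument lives at [esp + 16 + 4*N].

/* Stack after push esi/edi/ebx/ebp (32-bit cdecl):
   [esp +  0]..[esp + 12]  saved registers
   [esp + 16]              return address
   [esp + 20]              y_buf  == [esp + 16 + 4]
   [esp + 24]              u_buf  == [esp + 16 + 8]
   ...
   [esp + 44]              width  == [esp + 16 + 28] */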
+
+// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked) void I422ToRGB24Row_SSSE3(
const uint8_t* y_buf,
@@ -2623,6 +2791,44 @@ __declspec(naked) void I422ToRGB24Row_SSSE3(
}
}
+// 8 pixels.
+// 8 UV values, mixed with 8 Y producing 8 RGB24 (24 bytes).
+__declspec(naked) void I444ToRGB24Row_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+    mov edx, [esp + 12 + 16]  // rgb24

+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi
+ movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
+ movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
+
+ convertloop:
+ READYUV444
+ YUVTORGB(ebx)
+ STORERGB24
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
// 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked) void I422ToRGB565Row_SSSE3(
@@ -2898,10 +3104,12 @@ __declspec(naked) void I422ToRGBARow_SSSE3(
}
#endif // HAS_I422TOARGBROW_SSSE3
// I400ToARGBRow_SSE2 is disabled due to the new yuvconstants parameter
#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
uint8_t* rgb_buf,
+ const struct YuvConstants*,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -2949,6 +3157,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
// note: vpunpcklbw mutates and vpackuswb unmutates.
__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
uint8_t* rgb_buf,
+ const struct YuvConstants*,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -3045,15 +3254,15 @@ __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
}
#endif // HAS_MIRRORROW_AVX2
-#ifdef HAS_MIRRORUVROW_SSSE3
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
@@ -3078,7 +3287,7 @@ __declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
ret
}
}
-#endif // HAS_MIRRORUVROW_SSSE3
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
@@ -3254,17 +3463,14 @@ __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
sub edx, eax
convertloop:
- vmovdqu ymm0, [eax] // read 32 U's
- vmovdqu ymm1, [eax + edx] // and 32 V's
- lea eax, [eax + 32]
- vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
- vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
- vextractf128 [edi], ymm2, 0 // bytes 0..15
- vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
- vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
-    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
- lea edi, [edi + 64]
- sub ecx, 32
+ vpmovzxbw ymm0, [eax]
+ vpmovzxbw ymm1, [eax + edx]
+ lea eax, [eax + 16]
+ vpsllw ymm1, ymm1, 8
+ vpor ymm2, ymm1, ymm0
+ vmovdqu [edi], ymm2
+ lea edi, [edi + 32]
+ sub ecx, 16
jg convertloop
pop edi
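The rewritten AVX2 loop gets the same interleaving with fewer shuffles: vpmovzxbw widens 16 U and 16 V bytes to words, vpsllw moves V into the high byte of each word, and vpor fuses them. A scalar sketch of the row's effect:

// Interleave one row of planar U and V into packed UV pairs.
static void MergeUVRowSketch(const unsigned char* src_u, const unsigned char* src_v,
                             unsigned char* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];  // low byte of each word (vpmovzxbw)
    dst_uv[2 * x + 1] = src_v[x];  // high byte (vpsllw 8, vpor)
  }
}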
@@ -4172,13 +4378,13 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 8 pixels at a time.
-__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -4267,7 +4473,7 @@ __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
+ mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
pcmpeqb xmm3, xmm3 // generate mask 0xff000000
@@ -4312,7 +4518,7 @@ __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
+ mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
@@ -4406,7 +4612,7 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
+ mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
@@ -4762,20 +4968,20 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb
movdqu xmm2, [esi] // read 4 pixels from src_argb1
movdqu xmm1, xmm0
movdqu xmm3, xmm2
@@ -4783,8 +4989,8 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
punpckhbw xmm1, xmm1 // next 2
punpcklbw xmm2, xmm5 // first 2
punpckhbw xmm3, xmm5 // next 2
- pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
- pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2
+ pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
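The punpcklbw-with-self plus pmulhuw pair in this loop is the usual approximate divide-by-255: duplicating a byte into both halves of a 16-bit lane multiplies it by 257, so the high-word multiply returns (a * 257 * b) >> 16, which is within one of a * b / 255. A scalar sketch:

// Approximate (a * b) / 255 the way the SSE2 loop does.
static unsigned char MulDiv255Sketch(unsigned char a, unsigned char b) {
  unsigned int a257 = (unsigned int)a * 257u;  // punpcklbw xmm0, xmm0
  return (unsigned char)((a257 * b) >> 16);    // pmulhuw
}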
@@ -4802,13 +5008,13 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -4817,11 +5023,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
jl convertloop49
convertloop4:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ paddusb xmm0, xmm1 // src_argb + src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -4832,11 +5038,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
jl convertloop19
convertloop1:
- movd xmm0, [eax] // read 1 pixels from src_argb0
+    movd xmm0, [eax]  // read 1 pixel from src_argb
lea eax, [eax + 4]
movd xmm1, [esi]  // read 1 pixel from src_argb1
lea esi, [esi + 4]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ paddusb xmm0, xmm1 // src_argb + src_argb1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
@@ -4851,23 +5057,23 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
- psubusb xmm0, xmm1 // src_argb0 - src_argb1
+ psubusb xmm0, xmm1 // src_argb - src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -4881,20 +5087,20 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
vpxor ymm5, ymm5, ymm5 // constant 0
convertloop:
- vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm1, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
lea esi, [esi + 32]
@@ -4902,8 +5108,8 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
vpunpckhbw ymm1, ymm1, ymm1 // high 4
vpunpcklbw ymm2, ymm3, ymm5 // low 4
vpunpckhbw ymm3, ymm3, ymm5 // high 4
- vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
- vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
+ vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4
+ vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4
vpackuswb ymm0, ymm0, ymm1
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@@ -4919,19 +5125,19 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
lea esi, [esi + 32]
@@ -4949,21 +5155,21 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
- vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
+ vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1
lea esi, [esi + 32]
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@@ -5450,7 +5656,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
// 1 pixel loop
l1:
- movd xmm2, dword ptr [eax] // 1 argb pixel, 4 bytes.
+ movd xmm2, dword ptr [eax] // 1 argb pixel
lea eax, [eax + 4]
punpcklbw xmm2, xmm1
punpcklwd xmm2, xmm1
diff --git a/files/source/scale.cc b/source/scale.cc
index ab085496..b7a602ba 100644
--- a/files/source/scale.cc
+++ b/source/scale.cc
@@ -17,6 +17,7 @@
#include "libyuv/planar_functions.h" // For CopyPlane
#include "libyuv/row.h"
#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h" // For UVScale
#ifdef __cplusplus
namespace libyuv {
@@ -28,6 +29,7 @@ static __inline int Abs(int v) {
}
#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
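CENTERSTART picks the starting 16.16 source coordinate so the 2-row (or 2-column) filter is centered rather than anchored at row 0. A worked example, scaling height 480 down to 270:

// dy = 65536 * 480 / 270 = 116508 (a step of about 1.778 source rows).
// CENTERSTART(dy, -32768) = (116508 >> 1) - 32768 = 25486,
// so the first destination row samples source y ~= 25486 / 65536 ~= 0.39,
// centering the filter taps instead of starting flush at the top edge.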
// Scale plane, 1/2
// This is an optimized version for scaling down a plane to 1/2 of
@@ -49,7 +51,7 @@ static void ScalePlaneDown2(int src_width,
? ScaleRowDown2_C
: (filtering == kFilterLinear ? ScaleRowDown2Linear_C
: ScaleRowDown2Box_C);
- int row_stride = src_stride << 1;
+ int row_stride = src_stride * 2;
(void)src_width;
(void)src_height;
if (!filtering) {
@@ -118,21 +120,29 @@ static void ScalePlaneDown2(int src_width,
}
}
#endif
-#if defined(HAS_SCALEROWDOWN2_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
+#if defined(HAS_SCALEROWDOWN2_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
ScaleRowDown2 =
filtering == kFilterNone
- ? ScaleRowDown2_Any_MMI
- : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI
- : ScaleRowDown2Box_Any_MMI);
- if (IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI
+ ? ScaleRowDown2_Any_LSX
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_LSX
+ : ScaleRowDown2Box_Any_LSX);
+ if (IS_ALIGNED(dst_width, 32)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_LSX
: (filtering == kFilterLinear
- ? ScaleRowDown2Linear_MMI
- : ScaleRowDown2Box_MMI);
+ ? ScaleRowDown2Linear_LSX
+ : ScaleRowDown2Box_LSX);
}
}
#endif
+#if defined(HAS_SCALEROWDOWN2_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowDown2 = filtering == kFilterNone
+ ? ScaleRowDown2_RVV
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_RVV
+ : ScaleRowDown2Box_RVV);
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -161,7 +171,7 @@ static void ScalePlaneDown2_16(int src_width,
? ScaleRowDown2_16_C
: (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
: ScaleRowDown2Box_16_C);
- int row_stride = src_stride << 1;
+ int row_stride = src_stride * 2;
(void)src_width;
(void)src_height;
if (!filtering) {
@@ -184,14 +194,6 @@ static void ScalePlaneDown2_16(int src_width,
: ScaleRowDown2Box_16_SSE2);
}
#endif
-#if defined(HAS_SCALEROWDOWN2_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI
- : (filtering == kFilterLinear
- ? ScaleRowDown2Linear_16_MMI
- : ScaleRowDown2Box_16_MMI);
- }
-#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -204,6 +206,51 @@ static void ScalePlaneDown2_16(int src_width,
}
}
+void ScalePlaneDown2_16To8(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int scale,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, int dst_width, int scale) =
+ (src_width & 1)
+ ? (filtering == kFilterNone
+ ? ScaleRowDown2_16To8_Odd_C
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_Odd_C
+ : ScaleRowDown2Box_16To8_Odd_C))
+ : (filtering == kFilterNone
+ ? ScaleRowDown2_16To8_C
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_C
+ : ScaleRowDown2Box_16To8_C));
+ int row_stride = src_stride * 2;
+ (void)dst_height;
+ if (!filtering) {
+ src_ptr += src_stride; // Point to odd rows.
+ src_stride = 0;
+ }
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (y = 0; y < src_height / 2; ++y) {
+ ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width, scale);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+ if (src_height & 1) {
+ if (!filtering) {
+ src_ptr -= src_stride; // Point to last row.
+ }
+ ScaleRowDown2(src_ptr, 0, dst_ptr, dst_width, scale);
+ }
+}
+
// Scale plane, 1/4
// This is an optimized version for scaling down a plane to 1/4 of
// its original size.
@@ -221,7 +268,7 @@ static void ScalePlaneDown4(int src_width,
void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride,
uint8_t* dst_ptr, int dst_width) =
filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
- int row_stride = src_stride << 2;
+ int row_stride = src_stride * 4;
(void)src_width;
(void)src_height;
if (!filtering) {
@@ -264,15 +311,20 @@ static void ScalePlaneDown4(int src_width,
}
}
#endif
-#if defined(HAS_SCALEROWDOWN4_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
+#if defined(HAS_SCALEROWDOWN4_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
ScaleRowDown4 =
- filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI;
- if (IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI;
+ filtering ? ScaleRowDown4Box_Any_LSX : ScaleRowDown4_Any_LSX;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_LSX : ScaleRowDown4_LSX;
}
}
#endif
+#if defined(HAS_SCALEROWDOWN4_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_RVV : ScaleRowDown4_RVV;
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -297,7 +349,7 @@ static void ScalePlaneDown4_16(int src_width,
void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride,
uint16_t* dst_ptr, int dst_width) =
filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
- int row_stride = src_stride << 2;
+ int row_stride = src_stride * 4;
(void)src_width;
(void)src_height;
if (!filtering) {
@@ -316,11 +368,6 @@ static void ScalePlaneDown4_16(int src_width,
filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
}
#endif
-#if defined(HAS_SCALEROWDOWN4_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI;
- }
-#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -398,6 +445,26 @@ static void ScalePlaneDown34(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN34_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_Any_LSX;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_LSX;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_LSX;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_LSX;
+ }
+ if (dst_width % 48 == 0) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_LSX;
+ ScaleRowDown34_1 = ScaleRowDown34_LSX;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_LSX;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_LSX;
+ }
+ }
+ }
+#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (!filtering) {
@@ -418,6 +485,17 @@ static void ScalePlaneDown34(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN34_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_RVV;
+ ScaleRowDown34_1 = ScaleRowDown34_RVV;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_RVV;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_RVV;
+ }
+ }
+#endif
for (y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -613,6 +691,37 @@ static void ScalePlaneDown38(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN38_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_Any_LSX;
+ ScaleRowDown38_2 = ScaleRowDown38_Any_LSX;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_LSX;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_LSX;
+ }
+ if (dst_width % 12 == 0) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_LSX;
+ ScaleRowDown38_2 = ScaleRowDown38_LSX;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_LSX;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_LSX;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_RVV;
+ ScaleRowDown38_2 = ScaleRowDown38_RVV;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_RVV;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_RVV;
+ }
+ }
+#endif
for (y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -746,9 +855,11 @@ static void ScaleAddCols2_C(int dst_width,
int ix = x >> 16;
x += dx;
boxwidth = MIN1((x >> 16) - ix);
- *dst_ptr++ =
- SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >>
- 16;
+ int scaletbl_index = boxwidth - minboxwidth;
+ assert((scaletbl_index == 0) || (scaletbl_index == 1));
+ *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + ix) *
+ scaletbl[scaletbl_index] >>
+ 16);
}
}
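Because x advances by a fixed 16.16 step, (x >> 16) - ix can only take two values, minboxwidth and minboxwidth + 1, which is exactly what the new assert encodes. A worked example for dst_width 3 over src_width 8:

// dx = 65536 * 8 / 3 = 174762, minboxwidth = dx >> 16 = 2.
// Successive boxes cover source columns [0,2), [2,5), [5,7): widths 2, 3, 2.
// scaletbl[] therefore needs only the two reciprocals 65536/2 and 65536/3.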
@@ -768,9 +879,10 @@ static void ScaleAddCols2_16_C(int dst_width,
int ix = x >> 16;
x += dx;
boxwidth = MIN1((x >> 16) - ix);
- *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
- scaletbl[boxwidth - minboxwidth] >>
- 16;
+ int scaletbl_index = boxwidth - minboxwidth;
+ assert((scaletbl_index == 0) || (scaletbl_index == 1));
+ *dst_ptr++ =
+ SumPixels_16(boxwidth, src_ptr + ix) * scaletbl[scaletbl_index] >> 16;
}
}
@@ -785,7 +897,7 @@ static void ScaleAddCols0_C(int dst_width,
(void)dx;
src_ptr += (x >> 16);
for (i = 0; i < dst_width; ++i) {
- *dst_ptr++ = src_ptr[i] * scaleval >> 16;
+ *dst_ptr++ = (uint8_t)(src_ptr[i] * scaleval >> 16);
}
}
@@ -800,7 +912,7 @@ static void ScaleAddCols1_C(int dst_width,
int i;
x >>= 16;
for (i = 0; i < dst_width; ++i) {
- *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
+ *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + x) * scaleval >> 16);
x += boxwidth;
}
}
@@ -827,14 +939,14 @@ static void ScaleAddCols1_16_C(int dst_width,
// one pixel of destination using fixed point (16.16) to step
// through source, sampling a box of pixels with simple
// averaging.
-static void ScalePlaneBox(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint8_t* src_ptr,
- uint8_t* dst_ptr) {
+static int ScalePlaneBox(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -848,6 +960,8 @@ static void ScalePlaneBox(int src_width,
{
// Allocate a row buffer of uint16_t.
align_buffer_64(row16, src_width * 2);
+ if (!row16)
+ return 1;
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint16_t* src_ptr, uint8_t* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_C
@@ -886,19 +1000,24 @@ static void ScalePlaneBox(int src_width,
}
}
#endif
-#if defined(HAS_SCALEADDROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleAddRow = ScaleAddRow_Any_MMI;
- if (IS_ALIGNED(src_width, 8)) {
- ScaleAddRow = ScaleAddRow_MMI;
+#if defined(HAS_SCALEADDROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleAddRow = ScaleAddRow_Any_LSX;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_LSX;
}
}
#endif
+#if defined(HAS_SCALEADDROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleAddRow = ScaleAddRow_RVV;
+ }
+#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
- const uint8_t* src = src_ptr + iy * src_stride;
+ const uint8_t* src = src_ptr + iy * (int64_t)src_stride;
y += dy;
if (y > max_y) {
y = max_y;
@@ -914,16 +1033,17 @@ static void ScalePlaneBox(int src_width,
}
free_aligned_buffer_64(row16);
}
+ return 0;
}
-static void ScalePlaneBox_16(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint16_t* src_ptr,
- uint16_t* dst_ptr) {
+static int ScalePlaneBox_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -937,6 +1057,8 @@ static void ScalePlaneBox_16(int src_width,
{
// Allocate a row buffer of uint32_t.
align_buffer_64(row32, src_width * 4);
+ if (!row32)
+ return 1;
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint32_t* src_ptr, uint16_t* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
@@ -949,15 +1071,10 @@ static void ScalePlaneBox_16(int src_width,
}
#endif
-#if defined(HAS_SCALEADDROW_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) {
- ScaleAddRow = ScaleAddRow_16_MMI;
- }
-#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
- const uint16_t* src = src_ptr + iy * src_stride;
+ const uint16_t* src = src_ptr + iy * (int64_t)src_stride;
y += dy;
if (y > max_y) {
y = max_y;
@@ -973,18 +1090,19 @@ static void ScalePlaneBox_16(int src_width,
}
free_aligned_buffer_64(row32);
}
+ return 0;
}
// Scale plane down with bilinear interpolation.
-void ScalePlaneBilinearDown(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint8_t* src_ptr,
- uint8_t* dst_ptr,
- enum FilterMode filtering) {
+static int ScalePlaneBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
@@ -993,13 +1111,15 @@ void ScalePlaneBilinearDown(int src_width,
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row buffer.
align_buffer_64(row, src_width);
+ if (!row)
+ return 1;
const int max_y = (src_height - 1) << 16;
int j;
- void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
- void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -1038,14 +1158,19 @@ void ScalePlaneBilinearDown(int src_width,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
#if defined(HAS_SCALEFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -1068,13 +1193,21 @@ void ScalePlaneBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_SCALEFILTERCOLS_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleFilterCols = ScaleFilterCols_LSX;
+ }
+ }
+#endif
if (y > max_y) {
y = max_y;
}
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- const uint8_t* src = src_ptr + yi * src_stride;
+ const uint8_t* src = src_ptr + yi * (int64_t)src_stride;
if (filtering == kFilterLinear) {
ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
} else {
@@ -1089,17 +1222,18 @@ void ScalePlaneBilinearDown(int src_width,
}
}
free_aligned_buffer_64(row);
+ return 0;
}
-void ScalePlaneBilinearDown_16(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint16_t* src_ptr,
- uint16_t* dst_ptr,
- enum FilterMode filtering) {
+static int ScalePlaneBilinearDown_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
@@ -1108,13 +1242,15 @@ void ScalePlaneBilinearDown_16(int src_width,
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row buffer.
align_buffer_64(row, src_width * 2);
+ if (!row)
+ return 1;
const int max_y = (src_height - 1) << 16;
int j;
- void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+ void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
- void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+ void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_16_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -1123,7 +1259,7 @@ void ScalePlaneBilinearDown_16(int src_width,
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- InterpolateRow = InterpolateRow_Any_16_SSE2;
+ InterpolateRow = InterpolateRow_16_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
@@ -1131,7 +1267,7 @@ void ScalePlaneBilinearDown_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ InterpolateRow = InterpolateRow_16_Any_SSSE3;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
@@ -1139,7 +1275,7 @@ void ScalePlaneBilinearDown_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_16_AVX2;
+ InterpolateRow = InterpolateRow_16_Any_AVX2;
if (IS_ALIGNED(src_width, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
}
@@ -1147,7 +1283,7 @@ void ScalePlaneBilinearDown_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_16_NEON;
+ InterpolateRow = InterpolateRow_16_Any_NEON;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
}
@@ -1165,7 +1301,7 @@ void ScalePlaneBilinearDown_16(int src_width,
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- const uint16_t* src = src_ptr + yi * src_stride;
+ const uint16_t* src = src_ptr + yi * (int64_t)src_stride;
if (filtering == kFilterLinear) {
ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
} else {
@@ -1180,18 +1316,19 @@ void ScalePlaneBilinearDown_16(int src_width,
}
}
free_aligned_buffer_64(row);
+ return 0;
}
// Scale plane up with bilinear interpolation.
-void ScalePlaneBilinearUp(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint8_t* src_ptr,
- uint8_t* dst_ptr,
- enum FilterMode filtering) {
+static int ScalePlaneBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ enum FilterMode filtering) {
int j;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -1199,10 +1336,10 @@ void ScalePlaneBilinearUp(int src_width,
int dx = 0;
int dy = 0;
const int max_y = (src_height - 1) << 16;
- void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
- void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr,
int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_C : ScaleCols_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -1233,6 +1370,11 @@ void ScalePlaneBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
if (filtering && src_width >= 32768) {
ScaleFilterCols = ScaleFilterCols64_C;
@@ -1258,6 +1400,14 @@ void ScalePlaneBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEFILTERCOLS_LSX)
+ if (filtering && TestCpuFlag(kCpuHasLSX) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleFilterCols = ScaleFilterCols_LSX;
+ }
+ }
+#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
@@ -1265,11 +1415,6 @@ void ScalePlaneBilinearUp(int src_width,
ScaleFilterCols = ScaleColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALECOLS_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleFilterCols = ScaleColsUp2_MMI;
- }
-#endif
}
if (y > max_y) {
@@ -1277,14 +1422,16 @@ void ScalePlaneBilinearUp(int src_width,
}
{
int yi = y >> 16;
- const uint8_t* src = src_ptr + yi * src_stride;
+ const uint8_t* src = src_ptr + yi * (int64_t)src_stride;
// Allocate 2 row buffers.
- const int kRowSize = (dst_width + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
+ const int row_size = (dst_width + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
uint8_t* rowptr = row;
- int rowstride = kRowSize;
+ int rowstride = row_size;
int lasty = yi;
ScaleFilterCols(rowptr, src, dst_width, x, dx);
@@ -1292,7 +1439,9 @@ void ScalePlaneBilinearUp(int src_width,
src += src_stride;
}
ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
- src += src_stride;
+ if (src_height > 2) {
+ src += src_stride;
+ }
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
@@ -1300,14 +1449,16 @@ void ScalePlaneBilinearUp(int src_width,
if (y > max_y) {
y = max_y;
yi = y >> 16;
- src = src_ptr + yi * src_stride;
+ src = src_ptr + yi * (int64_t)src_stride;
}
if (yi != lasty) {
ScaleFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
- src += src_stride;
+ if ((y + 65536) < max_y) {
+ src += src_stride;
+ }
}
}
if (filtering == kFilterLinear) {
@@ -1321,17 +1472,355 @@ void ScalePlaneBilinearUp(int src_width,
}
free_aligned_buffer_64(row);
}
+ return 0;
}
-void ScalePlaneBilinearUp_16(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint16_t* src_ptr,
- uint16_t* dst_ptr,
- enum FilterMode filtering) {
+// Scale plane, horizontally up by 2 times.
+// Uses linear filter horizontally, nearest vertically.
+// This is an optimized version for scaling up a plane to 2 times of
+// its original width, using linear interpolation.
+// This is used to scale U and V planes of I422 to I444.
+static void ScalePlaneUp2_Linear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
+ ScaleRowUp2_Linear_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ (void)src_width;
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+#ifdef HAS_SCALEROWUP2_LINEAR_RVV
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowUp = ScaleRowUp2_Linear_RVV;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ }
+}
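A scalar sketch of the 2x horizontal linear kernel these Any/SIMD variants implement, assuming the common quarter-pixel 3:1 taps (edge handling for the final source pixel omitted):

static void ScaleRowUp2LinearSketch(const unsigned char* src, unsigned char* dst,
                                    int dst_width) {
  int x;
  int src_width = dst_width / 2;
  for (x = 0; x < src_width; ++x) {
    // Two outputs per source pixel, each weighted 3:1 with its neighbor.
    dst[2 * x + 0] = (unsigned char)((3 * src[x] + src[x + 1] + 2) >> 2);
    dst[2 * x + 1] = (unsigned char)((src[x] + 3 * src[x + 1] + 2) >> 2);
  }
}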
+
+// Scale plane, up by 2 times.
+// This is an optimized version for scaling up a plane to 2 times of
+// its original size, using bilinear interpolation.
+// This is used to scale U and V planes of I420 to I444.
+static void ScalePlaneUp2_Bilinear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleRowUp2_Bilinear_Any_C;
+ int x;
+
+ (void)src_width;
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
+ }
+#endif
+#ifdef HAS_SCALEROWUP2_BILINEAR_RVV
+ if (TestCpuFlag(kCpuHasRVV)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_RVV;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ // TODO(fbarchard): Test performance of writing one row of destination at a
+ // time.
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
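The 2x bilinear path writes two destination rows per call, one above and one below the source row pair; each output pixel is a blend of the four nearest source pixels. Assuming the standard half-pixel-centered 9:3:3:1 weights, one tap looks like:

// s00 is the nearest source pixel to this output, s11 the farthest.
static unsigned char Blend9331(int s00, int s01, int s10, int s11) {
  return (unsigned char)((9 * s00 + 3 * s01 + 3 * s10 + s11 + 8) >> 4);
}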
+
+// Scale at most 14 bit plane, horizontally up by 2 times.
+// This is an optimized version for scaling up a plane to 2 times of
+// its original width, using linear interpolation.
+// stride is in count of uint16_t.
+// This is used to scale U and V planes of I210 to I410 and I212 to I412.
+static void ScalePlaneUp2_12_Linear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ (void)src_width;
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale at most 12 bit plane, up by 2 times.
+// This is an optimized version for scaling up a plane to 2 times of
+// its original size, using bilinear interpolation.
+// stride is in count of uint16_t.
+// This is used to scale U and V planes of I010 to I410 and I012 to I412.
+static void ScalePlaneUp2_12_Bilinear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleRowUp2_Bilinear_16_Any_C;
+ int x;
+
+ (void)src_width;
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
+static void ScalePlaneUp2_16_Linear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ (void)src_width;
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+static void ScalePlaneUp2_16_Bilinear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleRowUp2_Bilinear_16_Any_C;
+ int x;
+
+ (void)src_width;
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
+static int ScalePlaneBilinearUp_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ enum FilterMode filtering) {
int j;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -1339,10 +1828,10 @@ void ScalePlaneBilinearUp_16(int src_width,
int dx = 0;
int dy = 0;
const int max_y = (src_height - 1) << 16;
- void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+ void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_16_C;
- void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+ void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr,
int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -1351,7 +1840,7 @@ void ScalePlaneBilinearUp_16(int src_width,
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- InterpolateRow = InterpolateRow_Any_16_SSE2;
+ InterpolateRow = InterpolateRow_16_Any_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
@@ -1359,7 +1848,7 @@ void ScalePlaneBilinearUp_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ InterpolateRow = InterpolateRow_16_Any_SSSE3;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
@@ -1367,7 +1856,7 @@ void ScalePlaneBilinearUp_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_16_AVX2;
+ InterpolateRow = InterpolateRow_16_Any_AVX2;
if (IS_ALIGNED(dst_width, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
}
@@ -1375,7 +1864,7 @@ void ScalePlaneBilinearUp_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_16_NEON;
+ InterpolateRow = InterpolateRow_16_Any_NEON;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
}
@@ -1397,34 +1886,31 @@ void ScalePlaneBilinearUp_16(int src_width,
ScaleFilterCols = ScaleColsUp2_16_SSE2;
}
#endif
-#if defined(HAS_SCALECOLS_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleFilterCols = ScaleColsUp2_16_MMI;
- }
-#endif
}
-
if (y > max_y) {
y = max_y;
}
{
int yi = y >> 16;
- const uint16_t* src = src_ptr + yi * src_stride;
+ const uint16_t* src = src_ptr + yi * (int64_t)src_stride;
// Allocate 2 row buffers.
- const int kRowSize = (dst_width + 31) & ~31;
- align_buffer_64(row, kRowSize * 4);
-
- uint16_t* rowptr = (uint16_t*)row;
- int rowstride = kRowSize;
+ const int row_size = (dst_width + 31) & ~31;
+ align_buffer_64(row, row_size * 4);
+ int rowstride = row_size;
int lasty = yi;
+ uint16_t* rowptr = (uint16_t*)row;
+ if (!row)
+ return 1;
ScaleFilterCols(rowptr, src, dst_width, x, dx);
if (src_height > 1) {
src += src_stride;
}
ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
- src += src_stride;
+ if (src_height > 2) {
+ src += src_stride;
+ }
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
@@ -1432,14 +1918,16 @@ void ScalePlaneBilinearUp_16(int src_width,
if (y > max_y) {
y = max_y;
yi = y >> 16;
- src = src_ptr + yi * src_stride;
+ src = src_ptr + yi * (int64_t)src_stride;
}
if (yi != lasty) {
ScaleFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
- src += src_stride;
+ if ((y + 65536) < max_y) {
+ src += src_stride;
+ }
}
}
if (filtering == kFilterLinear) {
@@ -1453,6 +1941,7 @@ void ScalePlaneBilinearUp_16(int src_width,
}
free_aligned_buffer_64(row);
}
+ return 0;
}
// Scale Plane to/from any dimensions, without interpolation.
@@ -1469,7 +1958,7 @@ static void ScalePlaneSimple(int src_width,
const uint8_t* src_ptr,
uint8_t* dst_ptr) {
int i;
- void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width,
+ void (*ScaleCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width,
int x, int dx) = ScaleCols_C;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -1487,15 +1976,11 @@ static void ScalePlaneSimple(int src_width,
ScaleCols = ScaleColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALECOLS_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleCols = ScaleColsUp2_MMI;
- }
-#endif
}
for (i = 0; i < dst_height; ++i) {
- ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
+ dx);
dst_ptr += dst_stride;
y += dy;
}
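ScalePlaneSimple is nearest-neighbor sampling in 16.16 fixed point; the (int64_t) cast added above keeps (y >> 16) * src_stride from overflowing 32-bit int on tall strides. The column step it dispatches to is essentially:

// Nearest-neighbor column stepping, 16.16 fixed point (cf. ScaleCols_C).
static void ScaleColsSketch(unsigned char* dst, const unsigned char* src,
                            int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}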
@@ -1510,7 +1995,7 @@ static void ScalePlaneSimple_16(int src_width,
const uint16_t* src_ptr,
uint16_t* dst_ptr) {
int i;
- void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width,
+ void (*ScaleCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width,
int x, int dx) = ScaleCols_16_C;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -1528,15 +2013,11 @@ static void ScalePlaneSimple_16(int src_width,
ScaleCols = ScaleColsUp2_16_SSE2;
}
#endif
-#if defined(HAS_SCALECOLS_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleCols = ScaleColsUp2_16_MMI;
- }
-#endif
}
for (i = 0; i < dst_height; ++i) {
- ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
+ dx);
dst_ptr += dst_stride;
y += dy;
}
@@ -1544,17 +2025,16 @@ static void ScalePlaneSimple_16(int src_width,
// Scale a plane.
// This function dispatches to a specialized scaler based on scale factor.
-
LIBYUV_API
-void ScalePlane(const uint8_t* src,
- int src_stride,
- int src_width,
- int src_height,
- uint8_t* dst,
- int dst_stride,
- int dst_width,
- int dst_height,
- enum FilterMode filtering) {
+int ScalePlane(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
filtering);
@@ -1562,23 +2042,31 @@ void ScalePlane(const uint8_t* src,
// Negative height means invert the image.
if (src_height < 0) {
src_height = -src_height;
- src = src + (src_height - 1) * src_stride;
+ src = src + (src_height - 1) * (int64_t)src_stride;
src_stride = -src_stride;
}
-
// Use specialized scales to improve performance for common resolutions.
// For example, all the 1/2 scalings will use ScalePlaneDown2()
if (dst_width == src_width && dst_height == src_height) {
// Straight copy.
CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
- return;
+ return 0;
}
if (dst_width == src_width && filtering != kFilterBox) {
- int dy = FixedDiv(src_height, dst_height);
+ int dy = 0;
+ int y = 0;
+ // When scaling down, use the center 2 rows to filter.
+ // When scaling up, last row of destination uses the last 2 source rows.
+ if (dst_height <= src_height) {
+ dy = FixedDiv(src_height, dst_height);
+ y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (src_height > 1 && dst_height > 1) {
+ dy = FixedDiv1(src_height, dst_height);
+ }
// Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst, 0, 0, dy, 1, filtering);
- return;
+ dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering);
+ return 0;
}
if (dst_width <= Abs(src_width) && dst_height <= src_height) {
// Scale down.
@@ -1586,58 +2074,67 @@ void ScalePlane(const uint8_t* src,
// optimized, 3/4
ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst, filtering);
- return;
+ return 0;
}
if (2 * dst_width == src_width && 2 * dst_height == src_height) {
// optimized, 1/2
ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst, filtering);
- return;
+ return 0;
}
// 3/8 rounded up for odd sized chroma height.
if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
// optimized, 3/8
ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst, filtering);
- return;
+ return 0;
}
if (4 * dst_width == src_width && 4 * dst_height == src_height &&
(filtering == kFilterBox || filtering == kFilterNone)) {
// optimized, 1/4
ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst, filtering);
- return;
+ return 0;
}
}
if (filtering == kFilterBox && dst_height * 2 < src_height) {
- ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst);
- return;
+ return ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ }
+ if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
+ ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
+ }
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
}
if (filtering && dst_height > src_height) {
- ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
+ return ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
}
if (filtering) {
- ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
+ return ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
}
ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst);
+ return 0;
}
LIBYUV_API
-void ScalePlane_16(const uint16_t* src,
- int src_stride,
- int src_width,
- int src_height,
- uint16_t* dst,
- int dst_stride,
- int dst_width,
- int dst_height,
- enum FilterMode filtering) {
+int ScalePlane_16(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
filtering);
@@ -1645,23 +2142,34 @@ void ScalePlane_16(const uint16_t* src,
// Negative height means invert the image.
if (src_height < 0) {
src_height = -src_height;
- src = src + (src_height - 1) * src_stride;
+ src = src + (src_height - 1) * (int64_t)src_stride;
src_stride = -src_stride;
}
-
// Use specialized scales to improve performance for common resolutions.
// For example, all the 1/2 scalings will use ScalePlaneDown2()
if (dst_width == src_width && dst_height == src_height) {
// Straight copy.
CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
- return;
+ return 0;
}
if (dst_width == src_width && filtering != kFilterBox) {
- int dy = FixedDiv(src_height, dst_height);
- // Arbitrary scale vertically, but unscaled vertically.
+ int dy = 0;
+ int y = 0;
+ // When scaling down, use the center 2 rows to filter.
+ // When scaling up, last row of destination uses the last 2 source rows.
+ if (dst_height <= src_height) {
+ dy = FixedDiv(src_height, dst_height);
+ y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter.
+      // When scaling up, ensure the last destination row uses the last
+      // source row. Skipping FixedDiv1 when dst_height is 1 avoids a
+      // divide by zero; dy stays 0 and no vertical scaling is done later.
+ } else if (src_height > 1 && dst_height > 1) {
+ dy = FixedDiv1(src_height, dst_height);
+ }
+ // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst, 0, 0, dy, 1, filtering);
- return;
+ dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering);
+ return 0;
}
if (dst_width <= Abs(src_width) && dst_height <= src_height) {
// Scale down.
@@ -1669,46 +2177,93 @@ void ScalePlane_16(const uint16_t* src,
// optimized, 3/4
ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- return;
+ return 0;
}
if (2 * dst_width == src_width && 2 * dst_height == src_height) {
// optimized, 1/2
ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- return;
+ return 0;
}
// 3/8 rounded up for odd sized chroma height.
if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
// optimized, 3/8
ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- return;
+ return 0;
}
if (4 * dst_width == src_width && 4 * dst_height == src_height &&
(filtering == kFilterBox || filtering == kFilterNone)) {
// optimized, 1/4
ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- return;
+ return 0;
}
}
if (filtering == kFilterBox && dst_height * 2 < src_height) {
- ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst);
- return;
+ return ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ }
+ if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
+ ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
+ }
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
}
if (filtering && dst_height > src_height) {
- ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
+ return ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
}
if (filtering) {
- ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- return;
+ return ScalePlaneBilinearDown_16(src_width, src_height, dst_width,
+ dst_height, src_stride, dst_stride, src,
+ dst, filtering);
}
ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst);
+ return 0;
+}
+
+LIBYUV_API
+int ScalePlane_12(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * (int64_t)src_stride;
+ src_stride = -src_stride;
+ }
+
+ if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
+ ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
+ }
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
+ }
+
+ return ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride,
+ dst_width, dst_height, filtering);
}
// Scale an I420 image.
@@ -1736,19 +2291,27 @@ int I420Scale(const uint8_t* src_y,
int src_halfheight = SUBSAMPLE(src_height, 1, 1);
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
- dst_width, dst_height, filtering);
- ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
- dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
- ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
- dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
- return 0;
+ r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+ dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+ dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
+ return r;
}
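
Aside: with ScalePlane now returning int, I420Scale reports the first plane
that fails. A minimal caller sketch, assuming a contiguous I420 layout with
even dimensions; the buffer math is illustrative, not part of the patch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "libyuv/scale.h"

int main(void) {
  int sw = 1280, sh = 720, dw = 640, dh = 360;
  uint8_t* src = (uint8_t*)calloc(1, (size_t)sw * sh * 3 / 2);  // I420
  uint8_t* dst = (uint8_t*)calloc(1, (size_t)dw * dh * 3 / 2);
  int r;
  if (!src || !dst)
    return 1;
  r = I420Scale(src, sw, src + sw * sh, (sw + 1) / 2,         // Y, U planes
                src + sw * sh * 5 / 4, (sw + 1) / 2, sw, sh,  // V plane
                dst, dw, dst + dw * dh, (dw + 1) / 2,
                dst + dw * dh * 5 / 4, (dw + 1) / 2, dw, dh,
                kFilterBilinear);
  if (r != 0)
    fprintf(stderr, "I420Scale failed: %d\n", r);
  free(src);
  free(dst);
  return r;
}
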
LIBYUV_API
@@ -1773,19 +2336,72 @@ int I420Scale_16(const uint16_t* src_y,
int src_halfheight = SUBSAMPLE(src_height, 1, 1);
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
- dst_width, dst_height, filtering);
- ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
- dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
- ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
- dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
- return 0;
+ r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+ dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+ dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
+ return r;
+}
+
+LIBYUV_API
+int I420Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+ dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+ dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
+ return r;
}
// Scale an I444 image.
@@ -1809,19 +2425,27 @@ int I444Scale(const uint8_t* src_y,
int dst_width,
int dst_height,
enum FilterMode filtering) {
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
- dst_width, dst_height, filtering);
- ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
- dst_width, dst_height, filtering);
- ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
- dst_width, dst_height, filtering);
- return 0;
+ r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u,
+ dst_stride_u, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v,
+ dst_stride_v, dst_width, dst_height, filtering);
+ return r;
}
LIBYUV_API
@@ -1842,19 +2466,239 @@ int I444Scale_16(const uint16_t* src_y,
int dst_width,
int dst_height,
enum FilterMode filtering) {
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
- dst_width, dst_height, filtering);
- ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
- dst_width, dst_height, filtering);
- ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
- dst_width, dst_height, filtering);
- return 0;
+ r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u,
+ dst_stride_u, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v,
+ dst_stride_v, dst_width, dst_height, filtering);
+ return r;
+}
+
+LIBYUV_API
+int I444Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_u, src_stride_u, src_width, src_height, dst_u,
+ dst_stride_u, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_v, src_stride_v, src_width, src_height, dst_v,
+ dst_stride_v, dst_width, dst_height, filtering);
+ return r;
+}
+
+// Scale an I422 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int I422Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_u, src_stride_u, src_halfwidth, src_height, dst_u,
+ dst_stride_u, dst_halfwidth, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane(src_v, src_stride_v, src_halfwidth, src_height, dst_v,
+ dst_stride_v, dst_halfwidth, dst_height, filtering);
+ return r;
+}
+
+LIBYUV_API
+int I422Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_height, dst_u,
+ dst_stride_u, dst_halfwidth, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_height, dst_v,
+ dst_stride_v, dst_halfwidth, dst_height, filtering);
+ return r;
+}
+
+LIBYUV_API
+int I422Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int r;
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_height, dst_u,
+ dst_stride_u, dst_halfwidth, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_height, dst_v,
+ dst_stride_v, dst_halfwidth, dst_height, filtering);
+ return r;
+}
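
Aside: the half-plane sizes above all come from SUBSAMPLE(v, 1, 1). A sketch
of the assumed macro shape, which rounds the halved dimension up and keeps
the sign of negative (inverted) heights:

#include <stdio.h>

// Assumed shape of libyuv's SUBSAMPLE(v, a, s).
static int subsample(int v, int a, int s) {
  return v < 0 ? -((-v + a) >> s) : (v + a) >> s;
}

int main(void) {
  printf("%d\n", subsample(1279, 1, 1));  // 640: odd luma width rounds up
  printf("%d\n", subsample(-720, 1, 1));  // -360: inversion flag preserved
  return 0;
}
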
+
+// Scale an NV12 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ int r;
+
+ if (!src_y || !src_uv || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
+ dst_stride_y, dst_width, dst_height, filtering);
+ if (r != 0) {
+ return r;
+ }
+ r = UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv,
+ dst_stride_uv, dst_halfwidth, dst_halfheight, filtering);
+ return r;
}
// Deprecated api
diff --git a/source/scale_any.cc b/source/scale_any.cc
new file mode 100644
index 00000000..f6576874
--- /dev/null
+++ b/source/scale_any.cc
@@ -0,0 +1,1078 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h> // For memset/memcpy
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Fixed scale down.
+// Mask may be non-power of 2, so use MOD
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+ int dst_width) { \
+ int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r); \
+ }
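
Aside: SDANY's split is plain remainder math. A toy end-to-end run with
pretend kernels standing in for the SIMD and C row functions; all names here
are illustrative.

#include <stdio.h>

// The "SIMD" kernel handles a multiple of (MASK + 1) output pixels and the
// C kernel mops up the remainder at the matching source offset.
static void simd_down2(const unsigned char* s, unsigned char* d, int n) {
  int i;
  for (i = 0; i < n; ++i) d[i] = s[2 * i + 1];  // pretend-vectorized
}
static void c_down2(const unsigned char* s, unsigned char* d, int n) {
  int i;
  for (i = 0; i < n; ++i) d[i] = s[2 * i + 1];
}

int main(void) {
  enum { MASK = 15, FACTOR = 2, BPP = 1 };
  unsigned char src[200], dst[100];
  int dst_width = 100, i;
  for (i = 0; i < 200; ++i) src[i] = (unsigned char)i;
  {
    int r = (int)((unsigned)dst_width % (MASK + 1));  // 100 % 16 = 4
    int n = dst_width - r;                            // 96 pixels via SIMD
    if (n > 0) simd_down2(src, dst, n);
    c_down2(src + n * FACTOR * BPP, dst + n * BPP, r);  // last 4 via C
    printf("n=%d r=%d dst[99]=%d\n", n, r, dst[99]);    // 96 4 199
  }
  return 0;
}
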
+
+// Fixed scale down for odd source width. Used by I420Blend subsampling.
+// Since dst_width is (width + 1) / 2, this function scales one less pixel
+// and copies the last pixel.
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+ int dst_width) { \
+ int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \
+ int n = (dst_width - 1) - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r + 1); \
+ }
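
Aside: worked numbers for SDODD with MASK = 15 (illustrative): for
dst_width = 101 (odd source width 201), r = (101 - 1) % 16 = 4 and n = 96,
so the SIMD kernel covers dst[0..95] and the odd-width C kernel receives
r + 1 = 5 pixels, producing the copied final pixel itself.
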
+
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3,
+ ScaleRowDown2Linear_SSSE3,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 15)
+SDANY(ScaleRowDown2Box_Any_SSSE3,
+ ScaleRowDown2Box_SSSE3,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3,
+ ScaleRowDown2Box_SSSE3,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+SDANY(ScaleUVRowDown2Box_Any_SSSE3,
+ ScaleUVRowDown2Box_SSSE3,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 3)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+SDANY(ScaleUVRowDown2Box_Any_AVX2,
+ ScaleUVRowDown2Box_AVX2,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 7)
+#endif
+#ifdef HAS_SCALEROWDOWN2_AVX2
+SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2,
+ ScaleRowDown2Linear_AVX2,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 31)
+SDANY(ScaleRowDown2Box_Any_AVX2,
+ ScaleRowDown2Box_AVX2,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 31)
+SDODD(ScaleRowDown2Box_Odd_AVX2,
+ ScaleRowDown2Box_AVX2,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_NEON
+SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON,
+ ScaleRowDown2Linear_NEON,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 15)
+SDANY(ScaleRowDown2Box_Any_NEON,
+ ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 15)
+SDODD(ScaleRowDown2Box_Odd_NEON,
+ ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2_NEON
+SDANY(ScaleUVRowDown2_Any_NEON,
+ ScaleUVRowDown2_NEON,
+ ScaleUVRowDown2_C,
+ 2,
+ 2,
+ 7)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2LINEAR_NEON
+SDANY(ScaleUVRowDown2Linear_Any_NEON,
+ ScaleUVRowDown2Linear_NEON,
+ ScaleUVRowDown2Linear_C,
+ 2,
+ 2,
+ 7)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
+SDANY(ScaleUVRowDown2Box_Any_NEON,
+ ScaleUVRowDown2Box_NEON,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 7)
+#endif
+
+#ifdef HAS_SCALEROWDOWN2_MSA
+SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_MSA,
+ ScaleRowDown2Linear_MSA,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 31)
+SDANY(ScaleRowDown2Box_Any_MSA,
+ ScaleRowDown2Box_MSA,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_LSX
+SDANY(ScaleRowDown2_Any_LSX, ScaleRowDown2_LSX, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_LSX,
+ ScaleRowDown2Linear_LSX,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 31)
+SDANY(ScaleRowDown2Box_Any_LSX,
+ ScaleRowDown2Box_LSX,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 31)
+#endif
+#ifdef HAS_SCALEROWDOWN4_SSSE3
+SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3,
+ ScaleRowDown4Box_SSSE3,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_AVX2
+SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2,
+ ScaleRowDown4Box_AVX2,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_NEON
+SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON,
+ ScaleRowDown4Box_NEON,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_MSA
+SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_MSA,
+ ScaleRowDown4Box_MSA,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_LSX
+SDANY(ScaleRowDown4_Any_LSX, ScaleRowDown4_LSX, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_LSX,
+ ScaleRowDown4Box_LSX,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN34_SSSE3
+SDANY(ScaleRowDown34_Any_SSSE3,
+ ScaleRowDown34_SSSE3,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
+ ScaleRowDown34_0_Box_SSSE3,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
+ ScaleRowDown34_1_Box_SSSE3,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_NEON
+SDANY(ScaleRowDown34_Any_NEON,
+ ScaleRowDown34_NEON,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON,
+ ScaleRowDown34_0_Box_NEON,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON,
+ ScaleRowDown34_1_Box_NEON,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_MSA
+SDANY(ScaleRowDown34_Any_MSA,
+ ScaleRowDown34_MSA,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_0_Box_Any_MSA,
+ ScaleRowDown34_0_Box_MSA,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_1_Box_Any_MSA,
+ ScaleRowDown34_1_Box_MSA,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+#endif
+#ifdef HAS_SCALEROWDOWN34_LSX
+SDANY(ScaleRowDown34_Any_LSX,
+ ScaleRowDown34_LSX,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_0_Box_Any_LSX,
+ ScaleRowDown34_0_Box_LSX,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_1_Box_Any_LSX,
+ ScaleRowDown34_1_Box_LSX,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+#endif
+#ifdef HAS_SCALEROWDOWN38_SSSE3
+SDANY(ScaleRowDown38_Any_SSSE3,
+ ScaleRowDown38_SSSE3,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
+ ScaleRowDown38_3_Box_SSSE3,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
+ ScaleRowDown38_2_Box_SSSE3,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 5)
+#endif
+#ifdef HAS_SCALEROWDOWN38_NEON
+SDANY(ScaleRowDown38_Any_NEON,
+ ScaleRowDown38_NEON,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON,
+ ScaleRowDown38_3_Box_NEON,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON,
+ ScaleRowDown38_2_Box_NEON,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+#endif
+#ifdef HAS_SCALEROWDOWN38_MSA
+SDANY(ScaleRowDown38_Any_MSA,
+ ScaleRowDown38_MSA,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_MSA,
+ ScaleRowDown38_3_Box_MSA,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_MSA,
+ ScaleRowDown38_2_Box_MSA,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+#endif
+#ifdef HAS_SCALEROWDOWN38_LSX
+SDANY(ScaleRowDown38_Any_LSX,
+ ScaleRowDown38_LSX,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_LSX,
+ ScaleRowDown38_3_Box_LSX,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_LSX,
+ ScaleRowDown38_2_Box_LSX,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2_SSE2
+SDANY(ScaleARGBRowDown2_Any_SSE2,
+ ScaleARGBRowDown2_SSE2,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
+ ScaleARGBRowDown2Linear_SSE2,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2,
+ ScaleARGBRowDown2Box_SSE2,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+SDANY(ScaleARGBRowDown2_Any_NEON,
+ ScaleARGBRowDown2_NEON,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON,
+ ScaleARGBRowDown2Linear_NEON,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON,
+ ScaleARGBRowDown2Box_NEON,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 7)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MSA
+SDANY(ScaleARGBRowDown2_Any_MSA,
+ ScaleARGBRowDown2_MSA,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_MSA,
+ ScaleARGBRowDown2Linear_MSA,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_MSA,
+ ScaleARGBRowDown2Box_MSA,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_LSX
+SDANY(ScaleARGBRowDown2_Any_LSX,
+ ScaleARGBRowDown2_LSX,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_LSX,
+ ScaleARGBRowDown2Linear_LSX,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_LSX,
+ ScaleARGBRowDown2Box_LSX,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 3)
+#endif
+#undef SDANY
+
+// Scale down by even scale factor.
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \
+ uint8_t* dst_ptr, int dst_width) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \
+ dst_ptr + n * BPP, r); \
+ }
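
Aside: every MASK + 1 here is a power of two, so SDANY's MOD reduces to bit
masking: dst_width = 13 with MASK = 3 gives r = 13 & 3 = 1 and
n = 13 & ~3 = 12, and the C tail starts at src_ptr + (n * src_stepx) * BPP
because the column step is a runtime value rather than a fixed FACTOR.
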
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
+SDAANY(ScaleARGBRowDownEven_Any_SSE2,
+ ScaleARGBRowDownEven_SSE2,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
+ ScaleARGBRowDownEvenBox_SSE2,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+SDAANY(ScaleARGBRowDownEven_Any_NEON,
+ ScaleARGBRowDownEven_NEON,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
+ ScaleARGBRowDownEvenBox_NEON,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA
+SDAANY(ScaleARGBRowDownEven_Any_MSA,
+ ScaleARGBRowDownEven_MSA,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
+ ScaleARGBRowDownEvenBox_MSA,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_LSX
+SDAANY(ScaleARGBRowDownEven_Any_LSX,
+ ScaleARGBRowDownEven_LSX,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_LSX,
+ ScaleARGBRowDownEvenBox_LSX,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEUVROWDOWNEVEN_NEON
+SDAANY(ScaleUVRowDownEven_Any_NEON,
+ ScaleUVRowDownEven_NEON,
+ ScaleUVRowDownEven_C,
+ 2,
+ 3)
+#endif
+
+#ifdef SASIMDONLY
+// This variant also works, using memcpy plus SIMD for the remainder instead
+// of a C kernel, but it is slower on ARM.
+
+// Add rows box filter scale down, using the macro pattern from row_any.cc.
+#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint16_t dst_temp[32]); \
+ SIMD_ALIGNED(uint8_t src_temp[32]); \
+ memset(dst_temp, 0, 32 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \
+ memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(src_temp, dst_temp, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \
+ }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_LSX
+SAROW(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, 1, 2, 15)
+#endif
+#undef SAROW
+
+#else
+
+// Add rows box filter scale down.
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
+ int n = src_width & ~MASK; \
+ if (n > 0) { \
+ SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
+ }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_LSX
+SAANY(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, ScaleAddRow_C, 15)
+#endif
+#undef SAANY
+
+#endif // SASIMDONLY
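
Aside: both the SAROW and SAANY wrappers dispatch the same primitive. A
reference sketch of the assumed ScaleAddRow contract: accumulate one uint8_t
source row into uint16_t sums, which the box filter averages afterwards.

#include <stdint.h>
#include <stdio.h>

static void scale_add_row(const uint8_t* src, uint16_t* dst, int w) {
  int i;
  for (i = 0; i < w; ++i) dst[i] = (uint16_t)(dst[i] + src[i]);
}

int main(void) {
  uint8_t row0[4] = {10, 20, 30, 40}, row1[4] = {30, 40, 50, 60};
  uint16_t sums[4] = {0, 0, 0, 0};
  scale_add_row(row0, sums, 4);  // sums of a 2-row box window
  scale_add_row(row1, sums, 4);
  printf("%d %d %d %d\n", sums[0], sums[1], sums[2], sums[3]);  // 40 60 80 100
  return 0;
}
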
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+ int dx) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
+ } \
+ TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
+ }
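
Aside: the C tail in CANY resumes at x + n * dx, keeping sample positions
continuous across the split: with dx = 98304 (1.5 px/step in 16.16) and
n = 8, the tail starts at 8 * 98304 = 786432, i.e. exactly 12.0 source
pixels in, where pixel 8 would have sampled anyway.
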
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_LSX
+CANY(ScaleFilterCols_Any_LSX, ScaleFilterCols_LSX, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBCOLS_LSX
+CANY(ScaleARGBCols_Any_LSX, ScaleARGBCols_LSX, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON,
+ ScaleARGBFilterCols_NEON,
+ ScaleARGBFilterCols_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+ ScaleARGBFilterCols_MSA,
+ ScaleARGBFilterCols_C,
+ 4,
+ 7)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_LSX
+CANY(ScaleARGBFilterCols_Any_LSX,
+ ScaleARGBFilterCols_LSX,
+ ScaleARGBFilterCols_C,
+ 4,
+ 7)
+#endif
+#undef CANY
+
+// Scale up horizontally 2 times using linear filter.
+#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ dst_ptr[0] = src_ptr[0]; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(src_ptr, dst_ptr + 1, n); \
+ } \
+ C(src_ptr + (n / 2), dst_ptr + n + 1, r); \
+ } \
+ dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; \
+ }
+
+// Even the C versions need to be wrapped, because boundary pixels have to
+// be handled differently.
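
Aside: a reference row function showing the boundary rule these wrappers
enforce, with interior weights following ScaleRowUp2_Linear_C's assumed
rounded 3:1 blend; treat the kernel body as illustrative.

#include <stdint.h>
#include <stdio.h>

// 2x horizontal linear upsample: endpoints copied, interior blended 3:1.
static void up2_linear(const uint8_t* src, uint8_t* dst, int dst_width) {
  int i;
  dst[0] = src[0];
  for (i = 1; i < dst_width - 1; ++i) {
    int s = i / 2;                    // nearest source pixel
    int o = (i & 1) ? s + 1 : s - 1;  // second-nearest source pixel
    dst[i] = (uint8_t)((3 * src[s] + src[o] + 2) >> 2);
  }
  dst[dst_width - 1] = src[(dst_width - 1) / 2];
}

int main(void) {
  uint8_t src[4] = {0, 40, 80, 120}, dst[8];
  int i;
  up2_linear(src, dst, 8);
  for (i = 0; i < 8; ++i) printf("%d ", dst[i]);  // 0 10 30 50 70 90 110 120
  printf("\n");
  return 0;
}
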
+
+SUH2LANY(ScaleRowUp2_Linear_Any_C,
+ ScaleRowUp2_Linear_C,
+ ScaleRowUp2_Linear_C,
+ 0,
+ uint8_t)
+
+SUH2LANY(ScaleRowUp2_Linear_16_Any_C,
+ ScaleRowUp2_Linear_16_C,
+ ScaleRowUp2_Linear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
+SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
+ ScaleRowUp2_Linear_SSE2,
+ ScaleRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
+SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
+ ScaleRowUp2_Linear_SSSE3,
+ ScaleRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
+SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3,
+ ScaleRowUp2_Linear_12_SSSE3,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
+SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
+ ScaleRowUp2_Linear_16_SSE2,
+ ScaleRowUp2_Linear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
+SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
+ ScaleRowUp2_Linear_AVX2,
+ ScaleRowUp2_Linear_C,
+ 31,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
+SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2,
+ ScaleRowUp2_Linear_12_AVX2,
+ ScaleRowUp2_Linear_16_C,
+ 31,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
+SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
+ ScaleRowUp2_Linear_16_AVX2,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_NEON
+SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
+ ScaleRowUp2_Linear_NEON,
+ ScaleRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON
+SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON,
+ ScaleRowUp2_Linear_12_NEON,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_NEON
+SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
+ ScaleRowUp2_Linear_16_NEON,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#undef SUH2LANY
+
+// Scale up 2 times using bilinear filter.
+// This function produces 2 rows at a time.
+#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
+ ptrdiff_t dst_stride, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ const PTYPE* sa = src_ptr; \
+ const PTYPE* sb = src_ptr + src_stride; \
+ PTYPE* da = dst_ptr; \
+ PTYPE* db = dst_ptr + dst_stride; \
+ da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \
+ db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(sa, sb - sa, da + 1, db - da, n); \
+ } \
+ C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \
+ } \
+ da[dst_width - 1] = \
+ (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \
+ db[dst_width - 1] = \
+ (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \
+ }
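
Aside: corner arithmetic for the rounded 3:1 vertical blends above, taking
sa[0] = 10 and sb[0] = 30: da[0] = (3*10 + 30 + 2) >> 2 = 15, biased toward
row a, and db[0] = (10 + 3*30 + 2) >> 2 = 25, biased toward row b.
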
+
+SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
+ ScaleRowUp2_Bilinear_C,
+ ScaleRowUp2_Bilinear_C,
+ 0,
+ uint8_t)
+
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C,
+ ScaleRowUp2_Bilinear_16_C,
+ ScaleRowUp2_Bilinear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
+SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
+ ScaleRowUp2_Bilinear_SSE2,
+ ScaleRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3,
+ ScaleRowUp2_Bilinear_12_SSSE3,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
+ ScaleRowUp2_Bilinear_16_SSE2,
+ ScaleRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
+SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
+ ScaleRowUp2_Bilinear_SSSE3,
+ ScaleRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
+SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
+ ScaleRowUp2_Bilinear_AVX2,
+ ScaleRowUp2_Bilinear_C,
+ 31,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
+SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2,
+ ScaleRowUp2_Bilinear_12_AVX2,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
+ ScaleRowUp2_Bilinear_16_AVX2,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_NEON
+SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
+ ScaleRowUp2_Bilinear_NEON,
+ ScaleRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON
+SU2BLANY(ScaleRowUp2_Bilinear_12_Any_NEON,
+ ScaleRowUp2_Bilinear_12_NEON,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_NEON
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
+ ScaleRowUp2_Bilinear_16_NEON,
+ ScaleRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#undef SU2BLANY
+
+// Scale bi-planar plane up horizontally 2 times using linear filter.
+#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ dst_ptr[0] = src_ptr[0]; \
+ dst_ptr[1] = src_ptr[1]; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(src_ptr, dst_ptr + 2, n); \
+ } \
+ C(src_ptr + n, dst_ptr + 2 * n + 2, r); \
+ } \
+ dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \
+ dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \
+ }
+
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_C,
+ ScaleUVRowUp2_Linear_C,
+ ScaleUVRowUp2_Linear_C,
+ 0,
+ uint8_t)
+
+SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C,
+ ScaleUVRowUp2_Linear_16_C,
+ ScaleUVRowUp2_Linear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3,
+ ScaleUVRowUp2_Linear_SSSE3,
+ ScaleUVRowUp2_Linear_C,
+ 7,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2,
+ ScaleUVRowUp2_Linear_AVX2,
+ ScaleUVRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41,
+ ScaleUVRowUp2_Linear_16_SSE41,
+ ScaleUVRowUp2_Linear_16_C,
+ 3,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2,
+ ScaleUVRowUp2_Linear_16_AVX2,
+ ScaleUVRowUp2_Linear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_NEON
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
+ ScaleUVRowUp2_Linear_NEON,
+ ScaleUVRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
+SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON,
+ ScaleUVRowUp2_Linear_16_NEON,
+ ScaleUVRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#undef SBUH2LANY
+
+// Scale bi-planar plane up 2 times using bilinear filter.
+// This function produces 2 rows at a time.
+#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
+ ptrdiff_t dst_stride, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ const PTYPE* sa = src_ptr; \
+ const PTYPE* sb = src_ptr + src_stride; \
+ PTYPE* da = dst_ptr; \
+ PTYPE* db = dst_ptr + dst_stride; \
+ da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \
+ db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \
+ da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \
+ db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(sa, sb - sa, da + 2, db - da, n); \
+ } \
+ C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \
+ } \
+ da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \
+ sb[((dst_width + 1) & ~1) - 2] + 2) >> \
+ 2; \
+ db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \
+ 3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \
+ 2; \
+ da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \
+ sb[((dst_width + 1) & ~1) - 1] + 2) >> \
+ 2; \
+ db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \
+ 3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \
+ 2; \
+ }
+
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
+ ScaleUVRowUp2_Bilinear_C,
+ ScaleUVRowUp2_Bilinear_C,
+ 0,
+ uint8_t)
+
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C,
+ ScaleUVRowUp2_Bilinear_16_C,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3,
+ ScaleUVRowUp2_Bilinear_SSSE3,
+ ScaleUVRowUp2_Bilinear_C,
+ 7,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
+ ScaleUVRowUp2_Bilinear_AVX2,
+ ScaleUVRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41,
+ ScaleUVRowUp2_Bilinear_16_SSE41,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2,
+ ScaleUVRowUp2_Bilinear_16_AVX2,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_NEON
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
+ ScaleUVRowUp2_Bilinear_NEON,
+ ScaleUVRowUp2_Bilinear_C,
+ 7,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON,
+ ScaleUVRowUp2_Bilinear_16_NEON,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#undef SBU2BLANY
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale_argb.cc b/source/scale_argb.cc
index beef380a..18bdeb86 100644
--- a/files/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -16,6 +16,7 @@
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h" // For CopyARGB
#include "libyuv/row.h"
+#include "libyuv/scale_argb.h"
#include "libyuv/scale_row.h"
#ifdef __cplusplus
@@ -58,9 +59,9 @@ static void ScaleARGBDown2(int src_width,
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
// Advance to odd row, even column.
if (filtering == kFilterBilinear) {
- src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
} else {
- src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+ src_argb += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 4;
}
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
@@ -111,22 +112,31 @@ static void ScaleARGBDown2(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBROWDOWN2_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
+#if defined(HAS_SCALEARGBROWDOWN2_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
ScaleARGBRowDown2 =
filtering == kFilterNone
- ? ScaleARGBRowDown2_Any_MMI
- : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI
- : ScaleARGBRowDown2Box_Any_MMI);
- if (IS_ALIGNED(dst_width, 2)) {
+ ? ScaleARGBRowDown2_Any_LSX
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_LSX
+ : ScaleARGBRowDown2Box_Any_LSX);
+ if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 =
filtering == kFilterNone
- ? ScaleARGBRowDown2_MMI
- : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI
- : ScaleARGBRowDown2Box_MMI);
+ ? ScaleARGBRowDown2_LSX
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_LSX
+ : ScaleARGBRowDown2Box_LSX);
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWN2_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_RVV
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_RVV
+ : ScaleARGBRowDown2Box_RVV);
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -141,28 +151,33 @@ static void ScaleARGBDown2(int src_width,
// ScaleARGB ARGB, 1/4
// This is an optimized version for scaling down a ARGB to 1/4 of
// its original size.
-static void ScaleARGBDown4Box(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint8_t* src_argb,
- uint8_t* dst_argb,
- int x,
- int dx,
- int y,
- int dy) {
+static int ScaleARGBDown4Box(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy) {
int j;
// Allocate 2 rows of ARGB.
- const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
+ const int row_size = (dst_width * 2 * 4 + 31) & ~31;
+  // TODO(fbarchard): Remove this row buffer and implement ScaleARGBRowDown4
+  // as a 2-pass wrapper that uses a very small array on the stack with a
+  // horizontal loop.
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
uint8_t* dst_argb, int dst_width) =
ScaleARGBRowDown2Box_C;
// Advance to odd row, even column.
- src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
(void)src_width;
(void)src_height;
(void)dx;
@@ -184,16 +199,22 @@ static void ScaleARGBDown4Box(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWN2_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_RVV;
+ }
+#endif
for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
- ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize,
+ ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + row_size,
dst_width * 2);
- ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
+ ScaleARGBRowDown2(row, row_size, dst_argb, dst_width);
src_argb += row_stride;
dst_argb += dst_stride;
}
free_aligned_buffer_64(row);
+ return 0;
}
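
Aside: the row_size rounding above keeps both scratch rows 32-byte aligned:
for dst_width = 100, 100 * 2 * 4 = 800 is already a multiple of 32, while
dst_width = 99 gives 792, rounded up via (792 + 31) & ~31 = 800 so that the
second row at row + row_size starts aligned as well.
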
// ScaleARGB ARGB Even
@@ -214,7 +235,7 @@ static void ScaleARGBDownEven(int src_width,
enum FilterMode filtering) {
int j;
int col_step = dx >> 16;
- int row_stride = (dy >> 16) * src_stride;
+ ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride);
void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
int src_step, uint8_t* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
@@ -222,7 +243,7 @@ static void ScaleARGBDownEven(int src_width,
(void)src_height;
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
- src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
@@ -253,16 +274,26 @@ static void ScaleARGBDownEven(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI
- : ScaleARGBRowDownEven_Any_MMI;
- if (IS_ALIGNED(dst_width, 2)) {
+#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_LSX
+ : ScaleARGBRowDownEven_Any_LSX;
+ if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven =
- filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI;
+ filtering ? ScaleARGBRowDownEvenBox_LSX : ScaleARGBRowDownEven_LSX;
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWNEVENBOX_RVV)
+ if (filtering && TestCpuFlag(kCpuHasRVV)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEvenBox_RVV;
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_RVV)
+ if (!filtering && TestCpuFlag(kCpuHasRVV)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_RVV;
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -275,24 +306,24 @@ static void ScaleARGBDownEven(int src_width,
}
// Scale ARGB down with bilinear interpolation.
-static void ScaleARGBBilinearDown(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint8_t* src_argb,
- uint8_t* dst_argb,
- int x,
- int dx,
- int y,
- int dy,
- enum FilterMode filtering) {
+static int ScaleARGBBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
int j;
- void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
- void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
@@ -340,6 +371,19 @@ static void ScaleARGBBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
@@ -361,10 +405,20 @@ static void ScaleARGBBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+ }
+ }
+#endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB.
{
align_buffer_64(row, clip_src_width * 4);
+ if (!row)
+ return 1;
const int max_y = (src_height - 1) << 16;
if (y > max_y) {
@@ -372,7 +426,7 @@ static void ScaleARGBBilinearDown(int src_width,
}
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- const uint8_t* src = src_argb + yi * src_stride;
+ const uint8_t* src = src_argb + yi * (intptr_t)src_stride;
if (filtering == kFilterLinear) {
ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
} else {
@@ -388,27 +442,28 @@ static void ScaleARGBBilinearDown(int src_width,
}
free_aligned_buffer_64(row);
}
+ return 0;
}
// Scale ARGB up with bilinear interpolation.
-static void ScaleARGBBilinearUp(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride,
- int dst_stride,
- const uint8_t* src_argb,
- uint8_t* dst_argb,
- int x,
- int dx,
- int y,
- int dy,
- enum FilterMode filtering) {
+static int ScaleARGBBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
int j;
- void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
- void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
@@ -444,14 +499,19 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(dst_width, 2)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
if (src_width >= 32768) {
ScaleARGBFilterCols =
filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
@@ -477,6 +537,14 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+ if (filtering && TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -498,11 +566,11 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBCOLS_MMI)
- if (!filtering && TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
- if (IS_ALIGNED(dst_width, 1)) {
- ScaleARGBFilterCols = ScaleARGBCols_MMI;
+#if defined(HAS_SCALEARGBCOLS_LSX)
+ if (!filtering && TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBCols_LSX;
}
}
#endif
@@ -513,11 +581,6 @@ static void ScaleARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALEARGBCOLSUP2_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
- ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
- }
-#endif
}
if (y > max_y) {
@@ -526,14 +589,16 @@ static void ScaleARGBBilinearUp(int src_width,
{
int yi = y >> 16;
- const uint8_t* src = src_argb + yi * src_stride;
+ const uint8_t* src = src_argb + yi * (intptr_t)src_stride;
// Allocate 2 rows of ARGB.
- const int kRowSize = (dst_width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
+ const int row_size = (dst_width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
uint8_t* rowptr = row;
- int rowstride = kRowSize;
+ int rowstride = row_size;
int lasty = yi;
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
@@ -541,7 +606,9 @@ static void ScaleARGBBilinearUp(int src_width,
src += src_stride;
}
ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
- src += src_stride;
+ if (src_height > 2) {
+ src += src_stride;
+ }
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
@@ -549,14 +616,16 @@ static void ScaleARGBBilinearUp(int src_width,
if (y > max_y) {
y = max_y;
yi = y >> 16;
- src = src_argb + yi * src_stride;
+ src = src_argb + yi * (intptr_t)src_stride;
}
if (yi != lasty) {
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
- src += src_stride;
+ if ((y + 65536) < max_y) {
+ src += src_stride;
+ }
}
}
if (filtering == kFilterLinear) {
@@ -570,27 +639,28 @@ static void ScaleARGBBilinearUp(int src_width,
}
free_aligned_buffer_64(row);
}
+ return 0;
}
#ifdef YUVSCALEUP
// Scale YUV to ARGB up with bilinear interpolation.
-static void ScaleYUVToARGBBilinearUp(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- int src_stride_y,
- int src_stride_u,
- int src_stride_v,
- int dst_stride_argb,
- const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb,
- int x,
- int dx,
- int y,
- int dy,
- enum FilterMode filtering) {
+static int ScaleYUVToARGBBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
int j;
void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, uint8_t* rgb_buf, int width) =
@@ -611,6 +681,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+ (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+ if (IS_ALIGNED(src_width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
@@ -627,8 +706,29 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(src_width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(src_width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ I422ToARGBRow = I422ToARGBRow_RVV;
+ }
+#endif
- void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -663,8 +763,21 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
- void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
if (src_width >= 32768) {
@@ -692,6 +805,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+ if (filtering && TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -713,11 +834,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBCOLS_MMI)
- if (!filtering && TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
- if (IS_ALIGNED(dst_width, 1)) {
- ScaleARGBFilterCols = ScaleARGBCols_MMI;
+#if defined(HAS_SCALEARGBCOLS_LSX)
+ if (!filtering && TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBCols_LSX;
}
}
#endif
@@ -728,11 +849,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALEARGBCOLSUP2_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
- ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
- }
-#endif
}
const int max_y = (src_height - 1) << 16;
@@ -742,20 +858,21 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
int yi = y >> 16;
int uv_yi = yi >> kYShift;
- const uint8_t* src_row_y = src_y + yi * src_stride_y;
- const uint8_t* src_row_u = src_u + uv_yi * src_stride_u;
- const uint8_t* src_row_v = src_v + uv_yi * src_stride_v;
-
- // Allocate 2 rows of ARGB.
- const int kRowSize = (dst_width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
+ const uint8_t* src_row_y = src_y + yi * (intptr_t)src_stride_y;
+ const uint8_t* src_row_u = src_u + uv_yi * (intptr_t)src_stride_u;
+ const uint8_t* src_row_v = src_v + uv_yi * (intptr_t)src_stride_v;
- // Allocate 1 row of ARGB for source conversion.
- align_buffer_64(argb_row, src_width * 4);
+ // Allocate 1 row of ARGB for source conversion and 2 rows of ARGB
+ // scaled horizontally to the destination width.
+ const int row_size = (dst_width * 4 + 31) & ~31;
+ align_buffer_64(row, row_size * 2 + src_width * 4);
+ uint8_t* argb_row = row + row_size * 2;
uint8_t* rowptr = row;
- int rowstride = kRowSize;
+ int rowstride = row_size;
int lasty = yi;
+ if (!row)
+ return 1;
// TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
@@ -782,9 +899,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
y = max_y;
yi = y >> 16;
uv_yi = yi >> kYShift;
- src_row_y = src_y + yi * src_stride_y;
- src_row_u = src_u + uv_yi * src_stride_u;
- src_row_v = src_v + uv_yi * src_stride_v;
+ src_row_y = src_y + yi * (intptr_t)src_stride_y;
+ src_row_u = src_u + uv_yi * (intptr_t)src_stride_u;
+ src_row_v = src_v + uv_yi * (intptr_t)src_stride_v;
}
if (yi != lasty) {
// TODO(fbarchard): Convert the clipped region of row.
@@ -810,7 +927,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
y += dy;
}
free_aligned_buffer_64(row);
- free_aligned_buffer_64(row_argb);
+ return 0;
}
#endif
@@ -832,7 +949,7 @@ static void ScaleARGBSimple(int src_width,
int y,
int dy) {
int j;
- void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*ScaleARGBCols)(uint8_t* dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
(void)src_height;
@@ -857,11 +974,11 @@ static void ScaleARGBSimple(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBCOLS_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBCols = ScaleARGBCols_Any_MMI;
- if (IS_ALIGNED(dst_width, 1)) {
- ScaleARGBCols = ScaleARGBCols_MMI;
+#if defined(HAS_SCALEARGBCOLS_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBCols = ScaleARGBCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBCols = ScaleARGBCols_LSX;
}
}
#endif
@@ -872,16 +989,11 @@ static void ScaleARGBSimple(int src_width,
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALEARGBCOLSUP2_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
- ScaleARGBCols = ScaleARGBColsUp2_MMI;
- }
-#endif
}
for (j = 0; j < dst_height; ++j) {
- ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
- dx);
+ ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (intptr_t)src_stride,
+ dst_width, x, dx);
dst_argb += dst_stride;
y += dy;
}
@@ -890,19 +1002,19 @@ static void ScaleARGBSimple(int src_width,
 // Scale an ARGB image.
// This function in turn calls a scaling function
// suitable for handling the desired resolutions.
-static void ScaleARGB(const uint8_t* src,
- int src_stride,
- int src_width,
- int src_height,
- uint8_t* dst,
- int dst_stride,
- int dst_width,
- int dst_height,
- int clip_x,
- int clip_y,
- int clip_width,
- int clip_height,
- enum FilterMode filtering) {
+static int ScaleARGB(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
@@ -916,7 +1028,7 @@ static void ScaleARGB(const uint8_t* src,
// Negative src_height means invert the image.
if (src_height < 0) {
src_height = -src_height;
- src = src + (src_height - 1) * src_stride;
+ src = src + (src_height - 1) * (intptr_t)src_stride;
src_stride = -src_stride;
}
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -931,7 +1043,7 @@ static void ScaleARGB(const uint8_t* src,
if (clip_y) {
int64_t clipf = (int64_t)(clip_y)*dy;
y += (clipf & 0xffff);
- src += (clipf >> 16) * src_stride;
+ src += (clipf >> 16) * (intptr_t)src_stride;
dst += clip_y * dst_stride;
}
@@ -947,51 +1059,50 @@ static void ScaleARGB(const uint8_t* src,
ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy,
filtering);
- return;
+ return 0;
}
if (dx == 0x40000 && filtering == kFilterBox) {
// Optimized 1/4 box downsample.
- ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height,
- src_stride, dst_stride, src, dst, x, dx, y, dy);
- return;
+ return ScaleARGBDown4Box(src_width, src_height, clip_width,
+ clip_height, src_stride, dst_stride, src,
+ dst, x, dx, y, dy);
}
ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst, x, dx, y, dy,
filtering);
- return;
+ return 0;
}
     // Optimized odd scale down, i.e. 3, 5, 7, 9x.
if ((dx & 0x10000) && (dy & 0x10000)) {
filtering = kFilterNone;
if (dx == 0x10000 && dy == 0x10000) {
// Straight copy.
- ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
- dst, dst_stride, clip_width, clip_height);
- return;
+ ARGBCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4,
+ src_stride, dst, dst_stride, clip_width, clip_height);
+ return 0;
}
}
}
}
if (dx == 0x10000 && (x & 0xffff) == 0) {
- // Arbitrary scale vertically, but unscaled vertically.
+ // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
- dst_stride, src, dst, x, y, dy, 4, filtering);
- return;
+ dst_stride, src, dst, x, y, dy, /*bpp=*/4, filtering);
+ return 0;
}
if (filtering && dy < 65536) {
- ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
- src_stride, dst_stride, src, dst, x, dx, y, dy,
- filtering);
- return;
+ return ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
}
if (filtering) {
- ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
- src_stride, dst_stride, src, dst, x, dx, y, dy,
- filtering);
- return;
+ return ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
}
ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst, x, dx, y, dy);
+ return 0;
}
LIBYUV_API
@@ -1015,10 +1126,9 @@ int ARGBScaleClip(const uint8_t* src_argb,
(clip_y + clip_height) > dst_height) {
return -1;
}
- ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
- dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width,
- clip_height, filtering);
- return 0;
+ return ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+ dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
+ clip_width, clip_height, filtering);
}
// Scale an ARGB image.
@@ -1036,10 +1146,9 @@ int ARGBScale(const uint8_t* src_argb,
src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) {
return -1;
}
- ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
- dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height,
- filtering);
- return 0;
+ return ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+ dst_stride_argb, dst_width, dst_height, 0, 0, dst_width,
+ dst_height, filtering);
}
// Scale with YUV conversion to ARGB and clipping.
@@ -1063,8 +1172,11 @@ int YUVToARGBScaleClip(const uint8_t* src_y,
int clip_width,
int clip_height,
enum FilterMode filtering) {
- uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4);
int r;
+ uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4);
+ if (!argb_buffer) {
+ return 1; // Out of memory runtime error.
+ }
(void)src_fourcc; // TODO(fbarchard): implement and/or assert.
(void)dst_fourcc;
I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
diff --git a/files/source/scale_common.cc b/source/scale_common.cc
index 63690271..d07a39af 100644
--- a/files/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -23,6 +23,25 @@ namespace libyuv {
extern "C" {
#endif
+#ifdef __cplusplus
+#define STATIC_CAST(type, expr) static_cast<type>(expr)
+#else
+#define STATIC_CAST(type, expr) (type)(expr)
+#endif
+
+// TODO(fbarchard): make clamp255 preserve negative values.
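+// (v >= 255) is 0 or 1, so -(v >= 255) is 0 or all ones; the OR saturates
+// values >= 255 and the final & 255 truncates to 8 bits.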
+static __inline int32_t clamp255(int32_t v) {
+ return (-(v >= 255) | v) & 255;
+}
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)
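+// e.g. a full-scale 10 bit value: C16TO8(1023, 16384) = (1023 * 16384) >> 16
+// = 255.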
+
static __inline int Abs(int v) {
return v >= 0 ? v : -v;
}
@@ -62,6 +81,50 @@ void ScaleRowDown2_16_C(const uint16_t* src_ptr,
}
}
+void ScaleRowDown2_16To8_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale) {
+ int x;
+ (void)src_stride;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale));
+ dst[1] = STATIC_CAST(uint8_t, C16TO8(src_ptr[3], scale));
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale));
+ }
+}
+
+void ScaleRowDown2_16To8_Odd_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale) {
+ int x;
+ (void)src_stride;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+ dst_width -= 1;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale));
+ dst[1] = STATIC_CAST(uint8_t, C16TO8(src_ptr[3], scale));
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale));
+ dst += 1;
+ src_ptr += 2;
+ }
+ dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[0], scale));
+}
+
void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -98,6 +161,52 @@ void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
}
}
+void ScaleRowDown2Linear_16To8_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale) {
+ const uint16_t* s = src_ptr;
+ int x;
+ (void)src_stride;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale));
+ dst[1] = STATIC_CAST(uint8_t, C16TO8((s[2] + s[3] + 1) >> 1, scale));
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale));
+ }
+}
+
+void ScaleRowDown2Linear_16To8_Odd_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale) {
+ const uint16_t* s = src_ptr;
+ int x;
+ (void)src_stride;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+ dst_width -= 1;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale));
+ dst[1] = STATIC_CAST(uint8_t, C16TO8((s[2] + s[3] + 1) >> 1, scale));
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale));
+ dst += 1;
+ s += 2;
+ }
+ dst[0] = STATIC_CAST(uint8_t, C16TO8(s[0], scale));
+}
+
void ScaleRowDown2Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -160,6 +269,61 @@ void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
}
}
+void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale) {
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
+ int x;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = STATIC_CAST(uint8_t,
+ C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale));
+ dst[1] = STATIC_CAST(uint8_t,
+ C16TO8((s[2] + s[3] + t[2] + t[3] + 2) >> 2, scale));
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = STATIC_CAST(uint8_t,
+ C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale));
+ }
+}
+
+void ScaleRowDown2Box_16To8_Odd_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width,
+ int scale) {
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
+ int x;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+ dst_width -= 1;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = STATIC_CAST(uint8_t,
+ C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale));
+ dst[1] = STATIC_CAST(uint8_t,
+ C16TO8((s[2] + s[3] + t[2] + t[3] + 2) >> 2, scale));
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = STATIC_CAST(uint8_t,
+ C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale));
+ dst += 1;
+ s += 2;
+ t += 2;
+ }
+ dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + t[0] + 1) >> 1, scale));
+}
+
void ScaleRowDown4_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -400,6 +564,95 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
}
}
+// Sample position: (O is src sample position, X is dst sample position)
+//
+// v dst_ptr at here v stop at here
+// X O X X O X X O X X O X X O X
+// ^ src_ptr at here
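+//
+// Each dst sample lies 1/4 or 3/4 of the way between adjacent src samples,
+// hence the 3:1 weights below; the +2 rounds the sum before the >> 2.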
+void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
+ dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
+ }
+}
+
+// Sample position: (O is src sample position, X is dst sample position)
+//
+// src_ptr at here
+// X v X X X X X X X X X
+// O O O O O
+// X X X X X X X X X X
+// ^ dst_ptr at here ^ stop at here
+// X X X X X X X X X X
+// O O O O O
+// X X X X X X X X X X
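+//
+// The 2D weights are products of the 1D weights (3/4, 1/4) in each axis:
+// 9:3:3:1, summing to 16; the +8 rounds the sum before the >> 4.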
+void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ uint8_t* d = dst_ptr;
+ uint8_t* e = dst_ptr + dst_stride;
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ d[2 * x + 0] =
+ (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
+ d[2 * x + 1] =
+ (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
+ e[2 * x + 0] =
+ (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
+ e[2 * x + 1] =
+ (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
+ }
+}
+
+// Only suitable for at most 14 bit range.
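+// (the 3:1 weighted sum adds 2 bits, so 14 bit input stays within 16 bits)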
+void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
+ dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
+ }
+}
+
+// Only suitable for at most 12 bit range.
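+// (the 9:3:3:1 weighted sum adds 4 bits, so 12 bit input stays within 16 bits)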
+void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
+ uint16_t* d = dst_ptr;
+ uint16_t* e = dst_ptr + dst_stride;
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ d[2 * x + 0] =
+ (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
+ d[2 * x + 1] =
+ (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
+ e[2 * x + 0] =
+ (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
+ e[2 * x + 1] =
+ (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
+ }
+}
+
// Scales a single row of pixels using point sampling.
void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
@@ -677,18 +930,18 @@ void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
- (65536 / 9) >>
+ (65536u / 9u) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
- (65536 / 9) >>
+ (65536u / 9u) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
- (65536 / 6) >>
+ (65536u / 6u) >>
16;
src_ptr += 8;
dst_ptr += 3;
@@ -731,15 +984,15 @@ void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2]) *
- (65536 / 6) >>
+ (65536u / 6u) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5]) *
- (65536 / 6) >>
+ (65536u / 6u) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
- (65536 / 4) >>
+ (65536u / 4u) >>
16;
src_ptr += 8;
dst_ptr += 3;
@@ -776,6 +1029,8 @@ void ScaleAddRow_16_C(const uint16_t* src_ptr,
}
}
+// ARGB scale row functions
+
void ScaleARGBRowDown2_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
@@ -1018,6 +1273,346 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
#undef BLENDERC
#undef BLENDER
+// UV scale row functions
+// Same as ARGB, but with 2 channels
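+// Several of these reinterpret a UV pair as a single uint16_t so that both
+// channels move together (see ScaleUVRowDownEven_C and ScaleUVCols_C below).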
+
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = src_uv[2]; // Store the 2nd UV
+ dst_uv[1] = src_uv[3];
+ src_uv += 4;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1;
+ src_uv += 4;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
+ src_uv[src_stride + 2] + 2) >>
+ 2;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
+ src_uv[src_stride + 3] + 2) >>
+ 2;
+ src_uv += 4;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ (void)src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[src_stepx];
+ src += src_stepx * 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
+ src_uv[src_stride + 2] + 2) >>
+ 2;
+ dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
+ src_uv[src_stride + 3] + 2) >>
+ 2;
+ src_uv += src_stepx * 2;
+ dst_uv += 2;
+ }
+}
+
+void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ dst_ptr[4 * x + 0] =
+ (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2;
+ dst_ptr[4 * x + 1] =
+ (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2;
+ dst_ptr[4 * x + 2] =
+ (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2;
+ dst_ptr[4 * x + 3] =
+ (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2;
+ }
+}
+
+void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ uint8_t* d = dst_ptr;
+ uint8_t* e = dst_ptr + dst_stride;
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
+ t[2 * x + 2] * 1 + 8) >>
+ 4;
+ d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
+ t[2 * x + 3] * 1 + 8) >>
+ 4;
+ d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
+ t[2 * x + 2] * 3 + 8) >>
+ 4;
+ d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
+ t[2 * x + 3] * 3 + 8) >>
+ 4;
+ e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
+ t[2 * x + 2] * 3 + 8) >>
+ 4;
+ e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
+ t[2 * x + 3] * 3 + 8) >>
+ 4;
+ e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
+ t[2 * x + 2] * 9 + 8) >>
+ 4;
+ e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
+ t[2 * x + 3] * 9 + 8) >>
+ 4;
+ }
+}
+
+void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ dst_ptr[4 * x + 0] =
+ (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2;
+ dst_ptr[4 * x + 1] =
+ (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2;
+ dst_ptr[4 * x + 2] =
+ (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2;
+ dst_ptr[4 * x + 3] =
+ (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2;
+ }
+}
+
+void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
+ uint16_t* d = dst_ptr;
+ uint16_t* e = dst_ptr + dst_stride;
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
+ t[2 * x + 2] * 1 + 8) >>
+ 4;
+ d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
+ t[2 * x + 3] * 1 + 8) >>
+ 4;
+ d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
+ t[2 * x + 2] * 3 + 8) >>
+ 4;
+ d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
+ t[2 * x + 3] * 3 + 8) >>
+ 4;
+ e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
+ t[2 * x + 2] * 3 + 8) >>
+ 4;
+ e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
+ t[2 * x + 3] * 3 + 8) >>
+ 4;
+ e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
+ t[2 * x + 2] * 9 + 8) >>
+ 4;
+ e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
+ t[2 * x + 3] * 9 + 8) >>
+ 4;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleUVCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+void ScaleUVCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64_t x = (int64_t)(x32);
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ (void)x;
+ (void)dx;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[1] = dst[0] = src[0];
+ src += 1;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
+// Mimics SSSE3 blender
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDERC(a, b, f, s) \
+ (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
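+// e.g. at the midpoint f = 64: BLENDER1(a, b, 64) = (a * 63 + b * 64) >> 7;
+// the 63 (0x7f ^ 64) rather than 64 (128 - 64) is the off-by-one the TODO
+// above refers to.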
+
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx) {
+ int64_t x = (int64_t)(x32);
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64_t xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int64_t xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint16_t a = src[xi];
+ uint16_t b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
// Scale plane vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
int dst_width,
@@ -1029,11 +1624,11 @@ void ScalePlaneVertical(int src_height,
int x,
int y,
int dy,
- int bpp,
+ int bpp, // bytes per pixel. 4 for ARGB.
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher bpp.
int dst_width_bytes = dst_width * bpp;
- void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+ void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
@@ -1075,14 +1670,20 @@ void ScalePlaneVertical(int src_height,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(dst_width_bytes, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
+
for (j = 0; j < dst_height; ++j) {
int yi;
int yf;
@@ -1097,6 +1698,7 @@ void ScalePlaneVertical(int src_height,
y += dy;
}
}
+
void ScalePlaneVertical_16(int src_height,
int dst_width,
int dst_height,
@@ -1107,11 +1709,11 @@ void ScalePlaneVertical_16(int src_height,
int x,
int y,
int dy,
- int wpp,
+ int wpp, /* words per pixel. normally 1 */
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher wpp.
int dst_width_words = dst_width * wpp;
- void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb,
+ void (*InterpolateRow)(uint16_t* dst_argb, const uint16_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_16_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
@@ -1123,32 +1725,32 @@ void ScalePlaneVertical_16(int src_height,
src_argb += (x >> 16) * wpp;
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- InterpolateRow = InterpolateRow_Any_16_SSE2;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_Any_SSE2;
+ if (IS_ALIGNED(dst_width_words, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_16_SSSE3;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_Any_SSSE3;
+ if (IS_ALIGNED(dst_width_words, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_16_AVX2;
- if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_16_Any_AVX2;
+ if (IS_ALIGNED(dst_width_words, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_16_NEON;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_Any_NEON;
+ if (IS_ALIGNED(dst_width_words, 8)) {
InterpolateRow = InterpolateRow_16_NEON;
}
}
@@ -1168,6 +1770,70 @@ void ScalePlaneVertical_16(int src_height,
}
}
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+void ScalePlaneVertical_16To8(int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int wpp, /* words per pixel. normally 1 */
+ int scale,
+ enum FilterMode filtering) {
+ // TODO(fbarchard): Allow higher wpp.
+ int dst_width_words = dst_width * wpp;
+ // TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions.
+ void (*InterpolateRow_16To8)(uint8_t* dst_argb, const uint16_t* src_argb,
+ ptrdiff_t src_stride, int scale, int dst_width,
+ int source_y_fraction) = InterpolateRow_16To8_C;
+ const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ int j;
+ assert(wpp >= 1 && wpp <= 2);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ src_argb += (x >> 16) * wpp;
+
+#if defined(HAS_INTERPOLATEROW_16TO8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow_16To8 = InterpolateRow_16To8_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow_16To8 = InterpolateRow_16To8_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16TO8_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow_16To8 = InterpolateRow_16To8_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow_16To8 = InterpolateRow_16To8_AVX2;
+ }
+ }
+#endif
+ for (j = 0; j < dst_height; ++j) {
+ int yi;
+ int yf;
+ if (y > max_y) {
+ y = max_y;
+ }
+ yi = y >> 16;
+ yf = filtering ? ((y >> 8) & 255) : 0;
+ InterpolateRow_16To8(dst_argb, src_argb + yi * src_stride, src_stride,
+ scale, dst_width_words, yf);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width,
int src_height,
@@ -1181,8 +1847,8 @@ enum FilterMode ScaleFilterReduce(int src_width,
src_height = -src_height;
}
if (filtering == kFilterBox) {
- // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
- if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+ // If scaling either axis to 0.5 or larger, switch from Box to Bilinear.
+ if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) {
filtering = kFilterBilinear;
}
}
@@ -1217,7 +1883,7 @@ int FixedDiv_C(int num, int div) {
return (int)(((int64_t)(num) << 16) / div);
}
-// Divide num by div and return as 16.16 fixed point result.
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
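+// e.g. FixedDiv1_C(9, 5) = ((9 << 16) - 0x10001) / 4 = 131071, one ulp under
+// 2.0 in 16.16 fixed point, i.e. (9 - 1) / (5 - 1).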
int FixedDiv1_C(int num, int div) {
return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1));
}
@@ -1260,14 +1926,14 @@ void ScaleSlope(int src_width,
if (dst_width <= Abs(src_width)) {
*dx = FixedDiv(Abs(src_width), dst_width);
*x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
- } else if (dst_width > 1) {
+ } else if (src_width > 1 && dst_width > 1) {
*dx = FixedDiv1(Abs(src_width), dst_width);
*x = 0;
}
if (dst_height <= src_height) {
*dy = FixedDiv(src_height, dst_height);
*y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
- } else if (dst_height > 1) {
+ } else if (src_height > 1 && dst_height > 1) {
*dy = FixedDiv1(src_height, dst_height);
*y = 0;
}
@@ -1276,7 +1942,7 @@ void ScaleSlope(int src_width,
if (dst_width <= Abs(src_width)) {
*dx = FixedDiv(Abs(src_width), dst_width);
*x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
- } else if (dst_width > 1) {
+ } else if (src_width > 1 && dst_width > 1) {
*dx = FixedDiv1(Abs(src_width), dst_width);
*x = 0;
}
@@ -1298,35 +1964,6 @@ void ScaleSlope(int src_width,
}
#undef CENTERSTART
-// Read 8x2 upsample with filtering and write 16x1.
-// actually reads an extra pixel, so 9x2.
-void ScaleRowUp2_16_C(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- const uint16_t* src2 = src_ptr + src_stride;
-
- int x;
- for (x = 0; x < dst_width - 1; x += 2) {
- uint16_t p0 = src_ptr[0];
- uint16_t p1 = src_ptr[1];
- uint16_t p2 = src2[0];
- uint16_t p3 = src2[1];
- dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
- dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
- ++src_ptr;
- ++src2;
- dst += 2;
- }
- if (dst_width & 1) {
- uint16_t p0 = src_ptr[0];
- uint16_t p1 = src_ptr[1];
- uint16_t p2 = src2[0];
- uint16_t p3 = src2[1];
- dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
- }
-}
-
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
new file mode 100644
index 00000000..17eeffad
--- /dev/null
+++ b/source/scale_gcc.cc
@@ -0,0 +1,2953 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+// Offsets for source bytes 0 to 9
+static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 0 to 10
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
+ 8, 9, 9, 10, 10, 11, 12, 13};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
+ 10, 11, 12, 13, 13, 14, 14, 15};
+
+// Coefficients for source bytes 0 to 10
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
+
+// Coefficients for source bytes 10 to 21
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
+
+// Coefficients for source bytes 21 to 31
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
+
+// Rounding constant (2), added before the >> 2 in the 3/4 box scalers
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
+
+static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
+ 6, 8, 11, 14, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 0,1,2
+static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 3,4,5
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
+ 6, 7, 12, 13, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x3 and 2x3
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+ 65536 / 9, 65536 / 6, 0, 0};
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
+ 11, 128, 14, 128, 128, 128, 128, 128};
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
+ 12, 128, 15, 128, 128, 128, 128, 128};
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
+ 13, 128, 128, 128, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x2 and 2x2
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+ 65536 / 3, 65536 / 2, 0, 0};
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
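+// Point samples every second pixel: psrlw $8 keeps the odd byte of each
+// 16-bit pair and packuswb repacks the two registers into 16 output bytes.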
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
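+  // xmm4 holds 0x01 in every byte so pmaddubsw sums adjacent pixel pairs;
+  // pavgw against the zeroed xmm5 then gives the rounded (a + b + 1) >> 1.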
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm4", "xmm5");
+}
+
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
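+  // Sums a 2x2 box per output pixel: pmaddubsw pairs pixels horizontally,
+  // paddw adds the second row, then psrlw $1 + pavgw 0 give (sum + 2) >> 2.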
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm4", "xmm5");
+}
+
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SCALEROWDOWN2_AVX2
+
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ intptr_t stridex3;
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "psllw $0x3,%%xmm5 \n"
+ "lea 0x00(%4,%4,2),%3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%4,2),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,2),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "=&r"(stridex3) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrld $0x18,%%ymm5,%%ymm5 \n"
+ "vpslld $0x10,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpsllw $0x3,%%ymm4,%%ymm5 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(src_stride * 3)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEROWDOWN4_AVX2
+
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kShuf0), // %0
+ "m"(kShuf1), // %1
+ "m"(kShuf2) // %2
+ );
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm2 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile(
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile(
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n"
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kShuf38a), // %3
+ "m"(kShuf38b) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
+}
+
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
+ :
+ : "m"(kShufAb0), // %0
+ "m"(kShufAb1), // %1
+ "m"(kShufAb2), // %2
+ "m"(kScaleAb2) // %3
+ );
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,(%1) \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6");
+}
+
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ :
+ : "m"(kShufAc), // %0
+ "m"(kShufAc3), // %1
+ "m"(kScaleAc33) // %2
+ );
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm6 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,2),%%xmm6 \n"
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movd %%xmm6,(%1) \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5,
+ 10, 11, 8, 9, 14, 15, 12, 13};
+
+static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
+ 3, 1, 1, 3, 3, 1, 1, 3};
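+
+// For reference, the 2x horizontal upsample kernels below compute, per
+// output pair, (3*near + far + 2) >> 2. kLinearMadd31 supplies the 3,1
+// weights for pmaddubsw and kLinearShuffleFar swaps adjacent words to
+// fetch the "far" sample. A minimal scalar sketch of the same math
+// (illustrative only; this name is not part of libyuv):
+static void ScaleRowUp2_Linear_Sketch(const uint8_t* src_ptr,
+                                      uint8_t* dst_ptr,
+                                      int dst_width) {
+  int src_width = dst_width >> 1;
+  int x;
+  for (x = 0; x < src_width; ++x) {
+    dst_ptr[2 * x + 0] = (3 * src_ptr[x] + src_ptr[x + 1] + 2) >> 2;
+    dst_ptr[2 * x + 1] = (src_ptr[x] + 3 * src_ptr[x + 1] + 2) >> 2;
+  }
+}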
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
+void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm0,%%xmm0 \n" // 0
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $1,%%xmm6 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm1 \n" // 01234567
+ "movq 1(%0),%%xmm2 \n" // 12345678
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
+ "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
+ "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
+ "paddw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "paddw %%xmm6,%%xmm4 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
+ "paddw %%xmm5,%%xmm5 \n"
+ "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
+ "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
+
+ "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
+ "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm2,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
+ "paddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm3 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
+
+ "packuswb %%xmm1,%%xmm5 \n"
+ "movdqu %%xmm5,(%1) \n"
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
+void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ LABELALIGN
+ "1: \n"
+ "pxor %%xmm0,%%xmm0 \n" // 0
+ // above line
+ "movq (%0),%%xmm1 \n" // 01234567
+ "movq 1(%0),%%xmm2 \n" // 12345678
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
+ "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
+ "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
+
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
+ "paddw %%xmm5,%%xmm4 \n" // near+far
+ "movdqa %%xmm3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
+ "paddw %%xmm5,%%xmm5 \n" // 2*near
+ "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
+
+ "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
+ "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm2,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
+ "paddw %%xmm3,%%xmm3 \n" // 2*near
+ "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ // below line
+ "movq (%0,%3),%%xmm6 \n" // 01234567
+ "movq 1(%0,%3),%%xmm2 \n" // 12345678
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
+ "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677
+ "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
+
+ "movdqa %%xmm6,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm7 \n"
+ "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16)
+ "paddw %%xmm7,%%xmm5 \n" // near+far
+ "movdqa %%xmm3,%%xmm7 \n"
+ "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16)
+ "paddw %%xmm7,%%xmm7 \n" // 2*near
+ "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo)
+
+ "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16)
+ "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm6,%%xmm2 \n" // near+far
+ "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
+ "paddw %%xmm3,%%xmm3 \n" // 2*near
+ "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
+
+ // xmm4 xmm1
+ // xmm5 xmm2
+ "pcmpeqw %%xmm0,%%xmm0 \n"
+ "psrlw $15,%%xmm0 \n"
+ "psllw $3,%%xmm0 \n" // all 8
+
+ "movdqa %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm3 \n" // ^ div by 16
+
+ "movdqa %%xmm1,%%xmm7 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm7 \n" // ^ div by 16
+
+ "packuswb %%xmm7,%%xmm3 \n"
+ "movdqu %%xmm3,(%1) \n" // save above line
+
+ "movdqa %%xmm5,%%xmm3 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16
+
+ "movdqa %%xmm2,%%xmm3 \n"
+ "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi)
+ "psrlw $4,%%xmm2 \n" // ^ div by 16
+
+ "packuswb %%xmm2,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4) \n" // save below line
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
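+
+// The bilinear (2D) kernels in this file factor the 2x upsample into two
+// 3:1 blends: one horizontal pass per source row (t1 and t2), then a
+// vertical 3:1 blend of the two rows, so each output is
+//   (3*t1 + t2 + 8) >> 4 == (9*near + 3*far_h + 3*far_v + far_diag + 8) >> 4
+// which is the "9 3 3 1 + 8 ... div by 16" noted in the comments.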
+
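+// The *_12_* variants below assume at most 12 significant bits per sample,
+// so the 9/3/3/1 sums plus rounding still fit in unsigned 16-bit words and
+// the blend can stay in 16-bit lanes.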
+#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
+void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movdqa %3,%%xmm5 \n"
+ "pcmpeqw %%xmm4,%%xmm4 \n"
+ "psrlw $15,%%xmm4 \n"
+ "psllw $1,%%xmm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // 01234567 (16)
+ "movdqu 2(%0),%%xmm1 \n" // 12345678 (16)
+
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16)
+ "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16)
+
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far)
+ "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far)
+
+ "paddw %%xmm4,%%xmm1 \n" // far+2
+ "paddw %%xmm4,%%xmm3 \n" // far+2
+ "paddw %%xmm0,%%xmm1 \n" // near+far+2
+ "paddw %%xmm2,%%xmm3 \n" // near+far+2
+ "paddw %%xmm0,%%xmm0 \n" // 2*near
+ "paddw %%xmm2,%%xmm2 \n" // 2*near
+ "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi)
+
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm2,16(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearShuffleFar) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n"
+ "psllw $3,%%xmm7 \n" // all 8
+ "movdqa %5,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ // above line
+ "movdqu (%0),%%xmm0 \n" // 01234567 (16)
+ "movdqu 2(%0),%%xmm1 \n" // 12345678 (16)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16)
+ "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16)
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far)
+ "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far)
+ "paddw %%xmm0,%%xmm1 \n" // near+far
+ "paddw %%xmm2,%%xmm3 \n" // near+far
+ "paddw %%xmm0,%%xmm0 \n" // 2*near
+ "paddw %%xmm2,%%xmm2 \n" // 2*near
+ "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo)
+ "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi)
+
+ // below line
+ "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16)
+ "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16)
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16)
+ "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16)
+ "movdqa %%xmm3,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far)
+ "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far)
+ "paddw %%xmm1,%%xmm4 \n" // near+far
+ "paddw %%xmm3,%%xmm5 \n" // near+far
+ "paddw %%xmm1,%%xmm1 \n" // 2*near
+ "paddw %%xmm3,%%xmm3 \n" // 2*near
+ "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo)
+ "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
+
+ // xmm0 xmm2
+ // xmm1 xmm3
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16
+ "movdqu %%xmm4,(%1) \n"
+
+ "movdqa %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16
+ "movdqu %%xmm4,0x10(%1) \n"
+
+ "movdqa %%xmm1,%%xmm4 \n"
+ "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16
+ "movdqu %%xmm1,(%1,%4,2) \n"
+
+ "movdqa %%xmm3,%%xmm4 \n"
+ "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm3 \n" // ^ div by 16
+ "movdqu %%xmm3,0x10(%1,%4,2) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearShuffleFar) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
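+// The *_16_* variants below handle full 16-bit samples, so they widen to
+// 32-bit lanes (punpcklwd with zero) before blending; the 9/3/3/1 sums
+// would overflow 16-bit intermediates.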
+#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
+void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqd %%xmm4,%%xmm4 \n"
+ "psrld $31,%%xmm4 \n"
+ "pslld $1,%%xmm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 0123 (16b)
+ "movq 2(%0),%%xmm1 \n" // 1234 (16b)
+
+ "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b)
+ "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b)
+
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+
+ "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far)
+ "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far)
+
+ "paddd %%xmm4,%%xmm2 \n" // far+2 (lo)
+ "paddd %%xmm4,%%xmm3 \n" // far+2 (hi)
+ "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
+
+ "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
+ "packssdw %%xmm1,%%xmm0 \n"
+ "pshufd $0b11011000,%%xmm0,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
+void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm7,%%xmm7 \n"
+ "pcmpeqd %%xmm6,%%xmm6 \n"
+ "psrld $31,%%xmm6 \n"
+ "pslld $3,%%xmm6 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
+ "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
+ "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v)
+ "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo)
+ "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi)
+ "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ "movq (%0),%%xmm0 \n" // 0123 (16b)
+ "movq 2(%0),%%xmm1 \n" // 1234 (16b)
+ "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b)
+ "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far)
+ "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far)
+ "paddd %%xmm0,%%xmm2 \n" // near+far (lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far (hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ "movq (%0,%3,2),%%xmm2 \n"
+ "movq 2(%0,%3,2),%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b)
+ "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b)
+ "movdqa %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far)
+ "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far)
+ "paddd %%xmm2,%%xmm4 \n" // near+far (lo)
+ "paddd %%xmm3,%%xmm5 \n" // near+far (hi)
+ "paddd %%xmm2,%%xmm2 \n" // 2*near (lo)
+ "paddd %%xmm3,%%xmm3 \n" // 2*near (hi)
+ "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo)
+ "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm2,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi)
+ "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi)
+ "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi)
+ "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
+ "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi)
+
+ "packssdw %%xmm0,%%xmm4 \n"
+ "pshufd $0b11011000,%%xmm4,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packssdw %%xmm2,%%xmm5 \n"
+ "pshufd $0b11011000,%%xmm5,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4,2) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
+void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm4,%%xmm4 \n"
+ "psrlw $15,%%xmm4 \n"
+ "psllw $1,%%xmm4 \n" // all 2
+ "movdqa %3,%%xmm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 01234567
+ "movq 1(%0),%%xmm1 \n" // 12345678
+ "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
+ "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
+ "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
+ "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
+ "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
+ "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearMadd31) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
+void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $3,%%xmm6 \n" // all 8
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 01234567
+ "movq 1(%0),%%xmm1 \n" // 12345678
+ "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
+ "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
+ "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
+ "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
+ "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
+
+ "movq (%0,%3),%%xmm1 \n"
+ "movq 1(%0,%3),%%xmm4 \n"
+ "punpcklwd %%xmm1,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhdq %%xmm4,%%xmm3 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
+ "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
+
+ // xmm0 xmm2
+ // xmm1 xmm3
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
+
+ "packuswb %%xmm0,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packuswb %%xmm1,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearMadd31) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
+void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ "vbroadcastf128 %3,%%ymm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearMadd31) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
+void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $15,%%ymm6,%%ymm6 \n"
+ "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
+ "vbroadcastf128 %5,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
+
+ "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
+ "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
+ "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
+
+ // ymm0 ymm1
+ // ymm2 ymm3
+
+ "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearMadd31) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
+void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm5 \n"
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b)
+ "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b)
+
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0
+
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near)
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
+ "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far)
+ "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
+
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2
+ "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2
+ "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2
+ "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2
+ "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
+ "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2
+
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
+ "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm2,32(%1) \n"
+
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearShuffleFar) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
+void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
+ "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
+ "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
+ "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
+ "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1)
+
+ "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
+ "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
+ "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
+ "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
+ "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
+ "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
+ "vmovdqu %%ymm0,(%1) \n" // store above
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
+ "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
+ "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
+ "vmovdqu %%ymm0,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearShuffleFar) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
+void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $31,%%ymm4,%%ymm4 \n"
+ "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
+
+ "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
+
+ "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
+ "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
+
+ "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
+ "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
+ "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
+ "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)
+
+ "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpshufd $0b11011000,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
+void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrld $31,%%ymm6,%%ymm6 \n"
+ "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
+ "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
+ "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
+ "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
+ "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
+ "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
+ "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi)
+
+ "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v)
+ "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v)
+ "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v)
+ "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far)
+ "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far)
+ "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
+ "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
+ "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
+ "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
+ "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo)
+ "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi)
+
+ "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
+ "vpshufd $0b11011000,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
+ "vpshufd $0b11011000,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile("pxor %%xmm5,%%xmm5 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n" // src_ptr += 16
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x10(%1),%%xmm1 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
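+
+// Scalar model of the accumulation above (a sketch, not a libyuv API):
+// widen each source byte and add it into the 16-bit sums with unsigned
+// saturation, matching paddusw.
+static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
+                               uint16_t* dst_ptr,
+                               int src_width) {
+  int x;
+  for (x = 0; x < src_width; ++x) {
+    uint32_t sum = (uint32_t)dst_ptr[x] + src_ptr[x];
+    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);  // paddusw saturates
+  }
+}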
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n" // src_ptr += 32
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpaddusw (%1),%%ymm2,%%ymm0 \n"
+ "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SCALEADDROW_AVX2
+
+// Constant for making pixels signed to avoid pmaddubsw
+// saturation.
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+
+// Constant for making pixels unsigned and adding .5 for rounding.
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+ 0x4040, 0x4040, 0x4040, 0x4040};
+
+// Bilinear column filtering. SSSE3 version.
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ intptr_t x0, x1, temp_pixel;
+ asm volatile(
+ "movd %6,%%xmm2 \n"
+ "movd %7,%%xmm3 \n"
+ "movl $0x04040000,%k2 \n"
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n" // 0x00010001
+
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "subl $0x2,%5 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movzwl 0x00(%1,%4,1),%k2 \n"
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
+ // 1
+ "paddusb %%xmm7,%%xmm1 \n"
+ "pmaddubsw %%xmm0,%%xmm1 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "paddw %9,%%xmm1 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,%k2 \n"
+ "mov %w2,(%0) \n"
+ "lea 0x2(%0),%0 \n"
+ "subl $0x2,%5 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n"
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm2 \n"
+ "paddusb %%xmm7,%%xmm2 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "paddw %9,%%xmm2 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movd %%xmm2,%k2 \n"
+ "mov %b2,(%0) \n"
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "=&a"(temp_pixel), // %2
+ "=&r"(x0), // %3
+ "=&r"(x1), // %4
+#if defined(__x86_64__)
+ "+rm"(dst_width) // %5
+#else
+ "+m"(dst_width) // %5
+#endif
+ : "rm"(x), // %6
+ "rm"(dx), // %7
+#if defined(__x86_64__)
+ "x"(kFsub80), // %8
+ "x"(kFadd40) // %9
+#else
+ "m"(kFsub80), // %8
+ "m"(kFadd40) // %9
+#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+// Reads 16 pixels, duplicates them and writes 32 pixels.
+// No alignment requirement: movdqu is used throughout.
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ (void)x;
+ (void)dx;
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
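+
+// The shufps $0x88/$0xdd plus pavgb pattern above splits even and odd ARGB
+// pixels after the rows were averaged vertically, then averages the pair,
+// i.e. per channel:
+//   dst = (avg(r0[2x], r1[2x]) + avg(r0[2x+1], r1[2x+1]) + 1) >> 1
+// Note pavgb rounds at each step, so this is not an exact 4-tap mean.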
+
+// Reads 4 pixels at a time, sampling every src_stepx'th pixel.
+// No alignment requirement: movdqu is used for the store.
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+ intptr_t src_stepx_x12;
+ (void)src_stride;
+ asm volatile(
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movd (%0),%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%0,%1,2),%%xmm2 \n"
+ "movd 0x00(%0,%4,1),%%xmm3 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width), // %3
+ "=&r"(src_stepx_x12) // %4
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+// Blends four 2x2 pixel blocks to 4x1.
+// No alignment requirement: movdqu is used for the store.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+ intptr_t src_stepx_x12;
+ intptr_t row1 = (intptr_t)(src_stride);
+ asm volatile(
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(%0,%5,1),%5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movhps 0x00(%0,%1,1),%%xmm0 \n"
+ "movq 0x00(%0,%1,2),%%xmm1 \n"
+ "movhps 0x00(%0,%4,1),%%xmm1 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "movq (%5),%%xmm2 \n"
+ "movhps 0x00(%5,%1,1),%%xmm2 \n"
+ "movq 0x00(%5,%1,2),%%xmm3 \n"
+ "movhps 0x00(%5,%4,1),%%xmm3 \n"
+ "lea 0x00(%5,%1,4),%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+rm"(dst_width), // %3
+ "=&r"(src_stepx_x12), // %4
+ "+r"(row1) // %5
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ intptr_t x0, x1;
+ asm volatile(
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x11,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x5,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "cmp $0x0,%4 \n"
+ "jl 99f \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ LABELALIGN
+ "40: \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "pextrw $0x7,%%xmm2,%k1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%3,%0,4),%%xmm1 \n"
+ "movd 0x00(%3,%1,4),%%xmm4 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%2) \n"
+ "lea 0x8(%2),%2 \n"
+ "29: \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "99: \n"
+ : "=&a"(x0), // %0
+ "=&d"(x1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_argb), // %3
+ "+r"(dst_width) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// No alignment requirement: movdqu is used throughout.
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ (void)x;
+ (void)dx;
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
+ "punpckhdq %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static const uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static const uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
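+
+// The ARGB bilinear column filter below keeps x in 16.16 fixed point and
+// uses the top 7 bits of the fraction, f = (x >> 9) & 0x7f, to weight two
+// adjacent pixels per channel, approximately
+//   dst = (src[xi] * (128 - f) + src[xi + 1] * f) >> 7
+// kShuffleColARGB interleaves the two pixels for pmaddubsw and
+// kShuffleFractions broadcasts f across each 8-byte half.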
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ intptr_t x0, x1;
+ asm volatile(
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
+ :
+ : "m"(kShuffleColARGB), // %0
+ "m"(kShuffleFractions) // %1
+ );
+
+ asm volatile(
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movhps 0x00(%1,%4,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%0) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n"
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%0) \n"
+
+ LABELALIGN
+ "99: \n" // clang-format error.
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+rm"(dst_width), // %2
+ "=&r"(x0), // %3
+ "=&r"(x1) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_X86(int num, int div) {
+ asm volatile(
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx");
+ return num;
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_X86(int num, int div) {
+ asm volatile(
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "sub $0x10001,%%eax \n"
+ "sbb $0x0,%%edx \n"
+ "sub $0x1,%1 \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx");
+ return num;
+}
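+
+// Portable equivalents of the two divides above (a sketch using 64-bit
+// math in place of the 32-bit idiv):
+//   FixedDiv_X86(num, div)  ~ (int)(((int64_t)num << 16) / div)
+//   FixedDiv1_X86(num, div) ~ (int)((((int64_t)num << 16) - 0x00010001) /
+//                                   (div - 1))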
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \
+ defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+
+// Shuffle table for splitting UV into upper and lower part of register.
+static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
+ 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
+static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u,
+ 6u, 14u, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80};
+#endif
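+
+// The UV 2x2 box filters below deinterleave U and V with kShuffleSplitUV,
+// add horizontal pairs via pmaddubsw against all-ones weights, add the
+// second row, then round in two steps (psrlw $1 followed by pavgw with
+// zero). Per channel that is exactly the rounded average
+//   dst = (r0[2x] + r0[2x+1] + r1[2x] + r1[2x+1] + 2) >> 2
+// before kShuffleMergeUV re-interleaves U and V.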
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5, %%xmm5 \n" // zero
+ "movdqa %4,%%xmm1 \n" // split shuffler
+ "movdqa %5,%%xmm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // 8 UV row 0
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
+ "lea 0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
+ "pshufb %%xmm1,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n" // vertical add
+ "psrlw $0x1,%%xmm0 \n" // round
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm3,%%xmm0 \n" // merge uv
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n" // 4 UV
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
+ "vbroadcastf128 %4,%%ymm1 \n" // split shuffler
+ "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
+ "lea 0x20(%0),%0 \n"
+ "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
+ "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n" // 8 UV
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
+
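+// Same 3:1 blend weights as kLinearMadd31, laid out for interleaved UV so
+// that the U and V samples of each pixel share a weight.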
+static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
+ 3, 1, 3, 1, 1, 3, 1, 3};
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
+void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm4,%%xmm4 \n"
+ "psrlw $15,%%xmm4 \n"
+ "psllw $1,%%xmm4 \n" // all 2
+ "movdqa %3,%%xmm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 00112233 (1u1v)
+ "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
+ "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
+ "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
+ "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi)
+ "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo)
+ "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kUVLinearMadd31) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
+void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $3,%%xmm6 \n" // all 8
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 00112233 (1u1v)
+ "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
+ "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
+ "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
+ "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi)
+ "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo)
+
+ "movq (%0,%3),%%xmm1 \n"
+ "movq 2(%0,%3),%%xmm4 \n"
+ "punpcklbw %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhdq %%xmm1,%%xmm3 \n"
+ "punpckldq %%xmm1,%%xmm1 \n"
+ "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
+ "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
+
+ // xmm0 xmm2
+ // xmm1 xmm3
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
+
+ "packuswb %%xmm0,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packuswb %%xmm1,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kUVLinearMadd31) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
+
+void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ "vbroadcastf128 %3,%%ymm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n"
+ "vmovdqu 2(%0),%%xmm1 \n"
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 uv to 16 uv
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kUVLinearMadd31) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
+void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $15,%%ymm6,%%ymm6 \n"
+ "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
+ "vbroadcastf128 %5,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n"
+ "vmovdqu 2(%0),%%xmm1 \n"
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
+
+ "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
+ "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n"
+ "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
+
+ // row 1: ymm0 (lo) ymm1 (hi)
+ // row 2: ymm2 (lo) ymm3 (hi)
+
+ "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 uv to 16 uv
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kUVLinearMadd31) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
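+// 16-bit UV variants: pmaddubsw only operates on bytes, so these widen to
+// 32-bit lanes and form 3*near+far from explicit adds (near + far, then
+// + 2*near) before narrowing back with packusdw (hence the SSE4.1 floor).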
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqd %%xmm4,%%xmm4 \n"
+ "psrld $31,%%xmm4 \n"
+ "pslld $1,%%xmm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
+ "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
+
+ "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v)
+ "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v)
+
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+
+ "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far)
+ "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far)
+
+ "paddd %%xmm4,%%xmm2 \n" // far+2 (lo)
+ "paddd %%xmm4,%%xmm3 \n" // far+2 (hi)
+ "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
+
+ "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
+ "packusdw %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 2 uv to 4 uv
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm7,%%xmm7 \n"
+ "pcmpeqd %%xmm6,%%xmm6 \n"
+ "psrld $31,%%xmm6 \n"
+ "pslld $3,%%xmm6 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
+ "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
+ "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v)
+ "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo)
+ "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi)
+ "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ "movq (%0,%3,2),%%xmm2 \n"
+ "movq 4(%0,%3,2),%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo)
+ "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi)
+ "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo)
+ "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi)
+ "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo)
+ "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi)
+ "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo)
+ "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm2,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi)
+ "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi)
+ "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi)
+ "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
+ "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi)
+
+ "packusdw %%xmm0,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packusdw %%xmm2,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4,2) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 2 uv to 4 uv
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $31,%%ymm4,%%ymm4 \n"
+ "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
+ "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
+
+ "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
+
+ "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far)
+ "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far)
+
+ "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
+ "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
+ "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
+ "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)
+
+ "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrld $31,%%ymm6,%%ymm6 \n"
+ "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
+ "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
+ "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
+ "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far)
+ "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far)
+ "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
+ "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
+ "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi)
+
+ "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v)
+ "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v)
+ "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v)
+ "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far)
+ "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far)
+ "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
+ "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
+ "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
+ "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
+ "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo)
+ "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi)
+
+ "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/scale_lsx.cc b/source/scale_lsx.cc
new file mode 100644
index 00000000..bfe5e9fb
--- /dev/null
+++ b/source/scale_lsx.cc
@@ -0,0 +1,739 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "libyuv/scale_row.h"
+
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define LOAD_DATA(_src, _in, _out) \
+ { \
+ int _tmp1, _tmp2, _tmp3, _tmp4; \
+ DUP4_ARG2(__lsx_vpickve2gr_w, _in, 0, _in, 1, _in, 2, _in, 3, _tmp1, \
+ _tmp2, _tmp3, _tmp4); \
+ _out = __lsx_vinsgr2vr_w(_out, _src[_tmp1], 0); \
+ _out = __lsx_vinsgr2vr_w(_out, _src[_tmp2], 1); \
+ _out = __lsx_vinsgr2vr_w(_out, _src[_tmp3], 2); \
+ _out = __lsx_vinsgr2vr_w(_out, _src[_tmp4], 3); \
+ }
+
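+// LOAD_DATA is a 4-element gather: _in carries four 32-bit indices that are
+// extracted to general registers and used to insert _src[index] into each
+// lane of _out. LSX has no gather instruction, so the column scalers below
+// use this to fetch non-contiguous source pixels.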
+void ScaleARGBRowDown2_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ int len = dst_width / 4;
+ (void)src_stride;
+ __m128i src0, src1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ dst0 = __lsx_vpickod_w(src1, src0);
+ __lsx_vst(dst0, dst_argb, 0);
+ src_argb += 32;
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ int len = dst_width / 4;
+ (void)src_stride;
+ __m128i src0, src1, tmp0, tmp1, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ tmp0 = __lsx_vpickev_w(src1, src0);
+ tmp1 = __lsx_vpickod_w(src1, src0);
+ dst0 = __lsx_vavgr_bu(tmp1, tmp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ src_argb += 32;
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ int len = dst_width / 4;
+ const uint8_t* s = src_argb;
+ const uint8_t* t = src_argb + src_stride;
+ __m128i src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i shuff = {0x0703060205010400, 0x0F0B0E0A0D090C08};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, s, 0, s, 16, src0, src1);
+ DUP2_ARG2(__lsx_vld, t, 0, t, 16, src2, src3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff, src1, src1, shuff, src2, src2,
+ shuff, src3, src3, shuff, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+ tmp3, reg0, reg1, reg2, reg3);
+ DUP2_ARG2(__lsx_vsadd_hu, reg0, reg2, reg1, reg3, reg0, reg1);
+ dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);
+ __lsx_vst(dst0, dst_argb, 0);
+ s += 32;
+ t += 32;
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ int len = dst_width / 4;
+ int32_t stepx = src_stepx << 2;
+ (void)src_stride;
+ __m128i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ dst0 = __lsx_vldrepl_w(src_argb, 0);
+ src_argb += stepx;
+ dst1 = __lsx_vldrepl_w(src_argb, 0);
+ src_argb += stepx;
+ dst2 = __lsx_vldrepl_w(src_argb, 0);
+ src_argb += stepx;
+ dst3 = __lsx_vldrepl_w(src_argb, 0);
+ src_argb += stepx;
+ __lsx_vstelm_w(dst0, dst_argb, 0, 0);
+ __lsx_vstelm_w(dst1, dst_argb, 4, 0);
+ __lsx_vstelm_w(dst2, dst_argb, 8, 0);
+ __lsx_vstelm_w(dst3, dst_argb, 12, 0);
+ dst_argb += 16;
+ }
+}
+
+void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ int x;
+ int len = dst_width / 4;
+ int32_t stepx = src_stepx * 4;
+ const uint8_t* next_argb = src_argb + src_stride;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, dst0;
+
+ for (x = 0; x < len; x++) {
+ tmp0 = __lsx_vldrepl_d(src_argb, 0);
+ src_argb += stepx;
+ tmp1 = __lsx_vldrepl_d(src_argb, 0);
+ src_argb += stepx;
+ tmp2 = __lsx_vldrepl_d(src_argb, 0);
+ src_argb += stepx;
+ tmp3 = __lsx_vldrepl_d(src_argb, 0);
+ src_argb += stepx;
+ tmp4 = __lsx_vldrepl_d(next_argb, 0);
+ next_argb += stepx;
+ tmp5 = __lsx_vldrepl_d(next_argb, 0);
+ next_argb += stepx;
+ tmp6 = __lsx_vldrepl_d(next_argb, 0);
+ next_argb += stepx;
+ tmp7 = __lsx_vldrepl_d(next_argb, 0);
+ next_argb += stepx;
+ DUP4_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
+ DUP2_ARG2(__lsx_vpackev_w, tmp1, tmp0, tmp3, tmp2, reg0, reg1);
+ DUP2_ARG2(__lsx_vpackod_w, tmp1, tmp0, tmp3, tmp2, tmp4, tmp5);
+ DUP2_ARG2(__lsx_vadd_h, reg0, tmp4, reg1, tmp5, reg0, reg1);
+ dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);
+ dst0 = __lsx_vshuf4i_b(dst0, 0xD8);
+ __lsx_vst(dst0, dst_argb, 0);
+ dst_argb += 16;
+ }
+}
+
+void ScaleRowDown2_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ int len = dst_width / 32;
+ __m128i src0, src1, src2, src3, dst0, dst1;
+ (void)src_stride;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst0, dst1);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ src_ptr += 64;
+ dst += 32;
+ }
+}
+
+void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ int len = dst_width / 32;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3, dst0, dst1;
+ (void)src_stride;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
+ DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp1, tmp2, tmp3, dst0, dst1);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ src_ptr += 64;
+ dst += 32;
+ }
+}
+
+void ScaleRowDown2Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ int len = dst_width / 32;
+ const uint8_t* src_nex = src_ptr + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
+ src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vsrarni_b_h, tmp1, tmp0, 2, tmp3, tmp2, 2, dst0, dst1);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ src_ptr += 64;
+ src_nex += 64;
+ dst += 32;
+ }
+}
+
+void ScaleRowDown4_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ int len = dst_width / 16;
+ __m128i src0, src1, src2, src3, tmp0, tmp1, dst0;
+ (void)src_stride;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp1);
+ dst0 = __lsx_vpickod_b(tmp1, tmp0);
+ __lsx_vst(dst0, dst, 0);
+ src_ptr += 64;
+ dst += 16;
+ }
+}
+
+void ScaleRowDown4Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ int len = dst_width / 16;
+ const uint8_t* ptr1 = src_ptr + src_stride;
+ const uint8_t* ptr2 = ptr1 + src_stride;
+ const uint8_t* ptr3 = ptr2 + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, ptr1, 0, ptr1, 16, ptr1, 32, ptr1, 48, src4, src5,
+ src6, src7);
+ DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, ptr2, 32, ptr2, 48, src0, src1,
+ src2, src3);
+ DUP4_ARG2(__lsx_vld, ptr3, 0, ptr3, 16, ptr3, 32, ptr3, 48, src4, src5,
+ src6, src7);
+ DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+ reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vhaddw_wu_hu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+ reg3, reg0, reg1, reg2, reg3);
+ DUP2_ARG3(__lsx_vsrarni_h_w, reg1, reg0, 4, reg3, reg2, 4, tmp0, tmp1);
+ dst0 = __lsx_vpickev_b(tmp1, tmp0);
+ __lsx_vst(dst0, dst, 0);
+ src_ptr += 64;
+ ptr1 += 64;
+ ptr2 += 64;
+ ptr3 += 64;
+ dst += 16;
+ }
+}
+
+void ScaleRowDown38_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x, len;
+ __m128i src0, src1, tmp0;
+ __m128i shuff = {0x13100E0B08060300, 0x000000001E1B1816};
+
+ assert(dst_width % 3 == 0);
+ len = dst_width / 12;
+ (void)src_stride;
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ tmp0 = __lsx_vshuf_b(src1, src0, shuff);
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ __lsx_vstelm_w(tmp0, dst, 8, 2);
+ src_ptr += 32;
+ dst += 12;
+ }
+}
+
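+// Divisions by 6 and 9 in the 3/8 box scalers below are done with Q16
+// reciprocal multiplies: 0x2AAA (~65536/6) and 0x1C71 (~65536/9). The high
+// 16 bits of the product, taken either via vmuh_h or a full multiply plus
+// byte shuffle, approximate x/6 and x/9 without a divide.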
+void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int x, len;
+ const uint8_t* src_nex = src_ptr + src_stride;
+ __m128i src0, src1, src2, src3, dst0;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};
+ __m128i const_0x2AAA = __lsx_vreplgr2vr_h(0x2AAA);
+ __m128i const_0x4000 = __lsx_vreplgr2vr_w(0x4000);
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ len = dst_width / 12;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_nex, 0, src_nex, 16, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
+ DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
+ DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
+ tmp4 = __lsx_vpickev_w(reg3, reg2);
+ tmp5 = __lsx_vadd_h(reg0, reg1);
+ tmp6 = __lsx_vadd_h(tmp5, tmp4);
+ tmp7 = __lsx_vmuh_h(tmp6, const_0x2AAA);
+ tmp0 = __lsx_vpickod_w(reg3, reg2);
+ tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);
+ tmp2 = __lsx_vmul_w(tmp1, const_0x4000);
+ dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);
+ __lsx_vstelm_d(dst0, dst_ptr, 0, 0);
+ __lsx_vstelm_w(dst0, dst_ptr, 8, 2);
+ src_ptr += 32;
+ src_nex += 32;
+ dst_ptr += 12;
+ }
+}
+
+void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int x, len;
+ const uint8_t* ptr1 = src_ptr + src_stride;
+ const uint8_t* ptr2 = ptr1 + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, dst0;
+ __m128i zero = __lsx_vldi(0);
+ __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};
+ __m128i const_0x1C71 = __lsx_vreplgr2vr_h(0x1C71);
+ __m128i const_0x2AAA = __lsx_vreplgr2vr_w(0x2AAA);
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ len = dst_width / 12;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, ptr1, 0, ptr1, 16, src0, src1,
+ src2, src3);
+ DUP2_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, src4, src5);
+ DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
+ DUP2_ARG2(__lsx_vpackev_b, zero, src4, zero, src5, tmp4, tmp6);
+ DUP2_ARG2(__lsx_vpackod_b, zero, src4, zero, src5, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
+ DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
+ tmp4 = __lsx_vpickev_w(reg3, reg2);
+ tmp5 = __lsx_vadd_h(reg0, reg1);
+ tmp6 = __lsx_vadd_h(tmp5, tmp4);
+ tmp7 = __lsx_vmuh_h(tmp6, const_0x1C71);
+ tmp0 = __lsx_vpickod_w(reg3, reg2);
+ tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);
+ tmp2 = __lsx_vmul_w(tmp1, const_0x2AAA);
+ dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);
+ __lsx_vstelm_d(dst0, dst_ptr, 0, 0);
+ __lsx_vstelm_w(dst0, dst_ptr, 8, 2);
+ src_ptr += 32;
+ ptr1 += 32;
+ ptr2 += 32;
+ dst_ptr += 12;
+ }
+}
+
+void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+ int x;
+ int len = src_width / 16;
+ __m128i src0, tmp0, tmp1, dst0, dst1;
+ __m128i zero = __lsx_vldi(0);
+
+ assert(src_width > 0);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_ptr, 0);
+ DUP2_ARG2(__lsx_vld, dst_ptr, 0, dst_ptr, 16, dst0, dst1);
+ tmp0 = __lsx_vilvl_b(zero, src0);
+ tmp1 = __lsx_vilvh_b(zero, src0);
+ DUP2_ARG2(__lsx_vadd_h, dst0, tmp0, dst1, tmp1, dst0, dst1);
+ __lsx_vst(dst0, dst_ptr, 0);
+ __lsx_vst(dst1, dst_ptr, 16);
+ src_ptr += 16;
+ dst_ptr += 16;
+ }
+}
+
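+// The column scalers walk the source in 16.16 fixed point: x >> 16 is the
+// integer pixel index and bits 15..9 of the fraction give a 7-bit blend
+// weight f, so each output is roughly
+//   dst[i] = src[xi] + (((src[xi + 1] - src[xi]) * f + 0x40) >> 7);
+// const_tmp pre-multiplies dx by {0,1,2,3} so four lanes step in lockstep.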
+void ScaleFilterCols_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int j;
+ int len = dst_width / 16;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i vec0, vec1, dst0;
+ __m128i vec_x = __lsx_vreplgr2vr_w(x);
+ __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
+ __m128i const1 = __lsx_vreplgr2vr_w(0xFFFF);
+ __m128i const2 = __lsx_vreplgr2vr_w(0x40);
+ __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
+
+ vec0 = __lsx_vmul_w(vec_dx, const_tmp);
+ vec1 = __lsx_vslli_w(vec_dx, 2);
+ vec_x = __lsx_vadd_w(vec_x, vec0);
+
+ for (j = 0; j < len; j++) {
+ tmp0 = __lsx_vsrai_w(vec_x, 16);
+ tmp4 = __lsx_vand_v(vec_x, const1);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ tmp1 = __lsx_vsrai_w(vec_x, 16);
+ tmp5 = __lsx_vand_v(vec_x, const1);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ tmp2 = __lsx_vsrai_w(vec_x, 16);
+ tmp6 = __lsx_vand_v(vec_x, const1);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ tmp3 = __lsx_vsrai_w(vec_x, 16);
+ tmp7 = __lsx_vand_v(vec_x, const1);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ DUP4_ARG2(__lsx_vsrai_w, tmp4, 9, tmp5, 9, tmp6, 9, tmp7, 9, tmp4, tmp5,
+ tmp6, tmp7);
+ LOAD_DATA(src_ptr, tmp0, reg0);
+ LOAD_DATA(src_ptr, tmp1, reg1);
+ LOAD_DATA(src_ptr, tmp2, reg2);
+ LOAD_DATA(src_ptr, tmp3, reg3);
+ DUP4_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp2, 1, tmp3, 1, tmp0, tmp1,
+ tmp2, tmp3);
+ LOAD_DATA(src_ptr, tmp0, reg4);
+ LOAD_DATA(src_ptr, tmp1, reg5);
+ LOAD_DATA(src_ptr, tmp2, reg6);
+ LOAD_DATA(src_ptr, tmp3, reg7);
+ DUP4_ARG2(__lsx_vsub_w, reg4, reg0, reg5, reg1, reg6, reg2, reg7, reg3,
+ reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vmul_w, reg4, tmp4, reg5, tmp5, reg6, tmp6, reg7, tmp7,
+ reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vadd_w, reg4, const2, reg5, const2, reg6, const2, reg7,
+ const2, reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vsrai_w, reg4, 7, reg5, 7, reg6, 7, reg7, 7, reg4, reg5,
+ reg6, reg7);
+ DUP4_ARG2(__lsx_vadd_w, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+ reg0, reg1, reg2, reg3);
+ DUP2_ARG2(__lsx_vpickev_h, reg1, reg0, reg3, reg2, tmp0, tmp1);
+ dst0 = __lsx_vpickev_b(tmp1, tmp0);
+ __lsx_vst(dst0, dst_ptr, 0);
+ dst_ptr += 16;
+ }
+}
+
+void ScaleARGBCols_LSX(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint32_t* src = (const uint32_t*)src_argb;
+ uint32_t* dst = (uint32_t*)dst_argb;
+ int j;
+ int len = dst_width / 4;
+ __m128i tmp0, tmp1, tmp2, dst0;
+ __m128i vec_x = __lsx_vreplgr2vr_w(x);
+ __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
+ __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
+
+ tmp0 = __lsx_vmul_w(vec_dx, const_tmp);
+ tmp1 = __lsx_vslli_w(vec_dx, 2);
+ vec_x = __lsx_vadd_w(vec_x, tmp0);
+
+ for (j = 0; j < len; j++) {
+ tmp2 = __lsx_vsrai_w(vec_x, 16);
+ vec_x = __lsx_vadd_w(vec_x, tmp1);
+ LOAD_DATA(src, tmp2, dst0);
+ __lsx_vst(dst0, dst, 0);
+ dst += 4;
+ }
+}
+
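+// ScaleARGBFilterCols_LSX blends whole ARGB pixels: the 7-bit fraction f is
+// splatted across a pixel with vshuf4i_b and interleaved with its
+// complement f ^ 0x7F (= 127 - f), so one vdp2_h_bu per pixel pair computes
+// roughly (near * (127 - f) + far * f) per byte channel, which vsrani_b_h
+// then narrows back to bytes with a >> 7.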
+void ScaleARGBFilterCols_LSX(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint32_t* src = (const uint32_t*)src_argb;
+ int j;
+ int len = dst_width / 8;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i vec0, vec1, dst0, dst1;
+ __m128i vec_x = __lsx_vreplgr2vr_w(x);
+ __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
+ __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
+ __m128i const_7f = __lsx_vldi(0x7F);
+
+ vec0 = __lsx_vmul_w(vec_dx, const_tmp);
+ vec1 = __lsx_vslli_w(vec_dx, 2);
+ vec_x = __lsx_vadd_w(vec_x, vec0);
+
+ for (j = 0; j < len; j++) {
+ tmp0 = __lsx_vsrai_w(vec_x, 16);
+ reg0 = __lsx_vsrai_w(vec_x, 9);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ tmp1 = __lsx_vsrai_w(vec_x, 16);
+ reg1 = __lsx_vsrai_w(vec_x, 9);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ DUP2_ARG2(__lsx_vand_v, reg0, const_7f, reg1, const_7f, reg0, reg1);
+ DUP2_ARG2(__lsx_vshuf4i_b, reg0, 0, reg1, 0, reg0, reg1);
+ DUP2_ARG2(__lsx_vxor_v, reg0, const_7f, reg1, const_7f, reg2, reg3);
+ DUP2_ARG2(__lsx_vilvl_b, reg0, reg2, reg1, reg3, reg4, reg6);
+ DUP2_ARG2(__lsx_vilvh_b, reg0, reg2, reg1, reg3, reg5, reg7);
+ LOAD_DATA(src, tmp0, src0);
+ LOAD_DATA(src, tmp1, src1);
+ DUP2_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp0, tmp1);
+ LOAD_DATA(src, tmp0, src2);
+ LOAD_DATA(src, tmp1, src3);
+ DUP2_ARG2(__lsx_vilvl_b, src2, src0, src3, src1, tmp4, tmp6);
+ DUP2_ARG2(__lsx_vilvh_b, src2, src0, src3, src1, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, reg4, tmp5, reg5, tmp6, reg6, tmp7, reg7,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vsrani_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst0, dst1);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ dst_argb += 32;
+ }
+}
+
+void ScaleRowDown34_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ __m128i src0, src1, src2, src3;
+ __m128i dst0, dst1, dst2;
+ __m128i shuff0 = {0x0908070504030100, 0x141311100F0D0C0B};
+ __m128i shuff1 = {0x0F0D0C0B09080705, 0x1918171514131110};
+ __m128i shuff2 = {0x141311100F0D0C0B, 0x1F1D1C1B19181715};
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+
+ for (x = 0; x < dst_width; x += 48) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0, src2, src1, shuff1, dst0,
+ dst1);
+ dst2 = __lsx_vshuf_b(src3, src2, shuff2);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ src_ptr += 64;
+ dst += 48;
+ }
+}
+
+void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width) {
+ const uint8_t* src_nex = src_ptr + src_stride;
+ int x;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+ __m128i tmp10, tmp11, dst0, dst1, dst2;
+ __m128i const0 = {0x0103030101010103, 0x0101010303010101};
+ __m128i const1 = {0x0301010101030301, 0x0103030101010103};
+ __m128i const2 = {0x0101010303010101, 0x0301010101030301};
+ __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};
+ __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
+ __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
+ __m128i shift0 = {0x0002000200010002, 0x0001000200020001};
+ __m128i shift1 = {0x0002000100020002, 0x0002000200010002};
+ __m128i shift2 = {0x0001000200020001, 0x0002000100020002};
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+
+ for (x = 0; x < dst_width; x += 48) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
+ src4, src5, src6, src7);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1,
+ shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4,
+ shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6,
+ shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
+ const0, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
+ const1, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11,
+ const2, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
+ shift0, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
+ shift1, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
+ shift2, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vslli_h, src0, 1, src1, 1, src2, 1, src3, 1, tmp5, tmp6,
+ tmp7, tmp8);
+ DUP2_ARG2(__lsx_vslli_h, src4, 1, src5, 1, tmp9, tmp10);
+ DUP4_ARG2(__lsx_vadd_h, src0, tmp5, src1, tmp6, src2, tmp7, src3, tmp8,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vadd_h, src4, tmp9, src5, tmp10, src4, src5);
+ DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
+ DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 2, src3, src2, 2, dst0, dst1);
+ dst2 = __lsx_vsrarni_b_h(src5, src4, 2);
+ __lsx_vst(dst0, d, 0);
+ __lsx_vst(dst1, d, 16);
+ __lsx_vst(dst2, d, 32);
+ src_ptr += 64;
+ src_nex += 64;
+ d += 48;
+ }
+}
+
+void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width) {
+ const uint8_t* src_nex = src_ptr + src_stride;
+ int x;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+ __m128i tmp10, tmp11, dst0, dst1, dst2;
+ __m128i const0 = {0x0103030101010103, 0x0101010303010101};
+ __m128i const1 = {0x0301010101030301, 0x0103030101010103};
+ __m128i const2 = {0x0101010303010101, 0x0301010101030301};
+ __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};
+ __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
+ __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
+ __m128i shift0 = {0x0002000200010002, 0x0001000200020001};
+ __m128i shift1 = {0x0002000100020002, 0x0002000200010002};
+ __m128i shift2 = {0x0001000200020001, 0x0002000100020002};
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+
+ for (x = 0; x < dst_width; x += 48) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
+ src4, src5, src6, src7);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1,
+ shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4,
+ shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6,
+ shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
+ const0, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
+ const1, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11,
+ const2, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
+ shift0, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
+ shift1, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
+ shift2, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
+ src0, src1, src2, src3);
+ DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
+ DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 1, src3, src2, 1, dst0, dst1);
+ dst2 = __lsx_vsrarni_b_h(src5, src4, 1);
+ __lsx_vst(dst0, d, 0);
+ __lsx_vst(dst1, d, 16);
+ __lsx_vst(dst2, d, 32);
+ src_ptr += 64;
+ src_nex += 64;
+ d += 48;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
diff --git a/files/source/scale_msa.cc b/source/scale_msa.cc
index 482a521f..482a521f 100644
--- a/files/source/scale_msa.cc
+++ b/source/scale_msa.cc
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
new file mode 100644
index 00000000..ccc75106
--- /dev/null
+++ b/source/scale_neon.cc
@@ -0,0 +1,1533 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// NEON downscalers with interpolation.
+// Provided by Fritz Koenig
+
+// Read 32x1, throw away the even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+
+// Read 32x1, average down, and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+
+// Read 32x2, average down, and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
+ // row1
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+ // pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc");
+}
+
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.8 {q1}, [%3]! \n"
+ "vld1.8 {q2}, [%4]! \n"
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(src_ptr2), // %4
+ "+r"(src_ptr3) // %5
+ :
+ : "q0", "q1", "q2", "q3", "memory", "cc");
+}
+
+// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
+// to de-interleave the source so every 4th pixel lands in its own register.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc");
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
+
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
+
+ // (3 * line_0 + line_1 + 2) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
+
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
+ "cc");
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
+
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
+}
+
+#define HAS_SCALEROWDOWN38_NEON
+static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
+ 22, 24, 27, 30, 0, 0, 0, 0};
+static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
+ 18, 6, 14, 19, 0, 0, 0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18};
+
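+// Note: vqrdmulh.s16 used below is a doubling multiply-high
+// ((2*a*b + 0x8000) >> 16), so the Div6/Div9 constants are 65536/12 and
+// 65536/18; the doubling restores the intended divides by 6 and 9.
+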
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "vld1.8 {q3}, [%3] \n"
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.8 {d4}, [%1]! \n"
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
+
+ asm volatile(
+ "vld1.16 {q13}, [%5] \n"
+ "vld1.8 {q14}, [%6] \n"
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
+
+ // Shuffle the input data around so that adjacent
+ // data can be added: 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
+
+ // d2 = xx 20 xx 30 xx 22 xx 32
+ // d3 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d2 = xx 20 xx 21 xx 22 xx 23
+ // d3 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q15 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_ptr1) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
+ "cc");
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
+
+ // Shuffle the input data around so that adjacent
+ // data can be added: 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+
+ // d2 = xx 20 xx 30 xx 22 xx 32
+ // d3 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d2 = xx 20 xx 21 xx 22 xx 23
+ // d3 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q13 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
+}
+
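+// 2x up-samplers, mirroring the x86 versions: the 1D pass uses the (3,1)/4
+// linear filter and the 2D pass the (9,3,3,1)/16 bilinear filter. vmlal.u8
+// against a vector of 3s forms 3*near+far directly in 16-bit lanes, and
+// vrshrn/vrshr supply the rounded shifts.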
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 1;
+ asm volatile(
+ "vmov.u8 d30, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 01234567
+ "vld1.8 {d5}, [%3]! \n" // 12345678
+
+ "vmovl.u8 q0, d4 \n" // 01234567 (16b)
+ "vmovl.u8 q1, d5 \n" // 12345678 (16b)
+ "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd)
+ "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even)
+
+ "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.8 {d0, d1}, [%1]! \n" // store
+ "subs %2, %2, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint8_t* src_temp = src_ptr + 1;
+ const uint8_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+ "vmov.u16 q15, #3 \n"
+ "vmov.u8 d28, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 01234567
+ "vld1.8 {d5}, [%5]! \n" // 12345678
+
+ "vmovl.u8 q0, d4 \n" // 01234567 (16b)
+ "vmovl.u8 q1, d5 \n" // 12345678 (16b)
+ "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd)
+ "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even)
+
+ "vld1.8 {d8}, [%1]! \n"
+ "vld1.8 {d9}, [%6]! \n"
+
+ "vmovl.u8 q2, d8 \n"
+ "vmovl.u8 q3, d9 \n"
+ "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd)
+ "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even)
+
+ // e o
+ // q1 q0
+ // q3 q2
+
+ "vmovq q4, q2 \n"
+ "vmovq q5, q3 \n"
+ "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+ "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+ "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+ "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+ // e o
+ // q5 q4
+ // q1 q0
+
+ "vrshrn.u16 d2, q1, #4 \n" // 2, even
+ "vrshrn.u16 d3, q0, #4 \n" // 2, odd
+ "vrshrn.u16 d0, q5, #4 \n" // 1, even
+ "vrshrn.u16 d1, q4, #4 \n" // 1, odd
+
+ "vst2.8 {d0, d1}, [%2]! \n" // store
+ "vst2.8 {d2, d3}, [%3]! \n" // store
+ "subs %4, %4, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
+ "q15" // Clobber List
+ );
+}
+
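+// The _12_ variants keep the whole 9/3/3/1 computation in 16-bit lanes: with
+// at most 12 significant bits per sample, 3*near+far <= 4*4095, and the
+// second 3:1 pass stays <= 16*4095 = 65520, which still fits in uint16_t.
+// Full 16-bit input uses the _16_ variants below, which widen to 32 bits.
+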
+void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1;
+ asm volatile(
+ "vmov.u16 q15, #3 \n"
+
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // 01234567 (16b)
+ "vld1.16 {q0}, [%3]! \n" // 12345678 (16b)
+
+ "vmovq q2, q0 \n"
+ "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd)
+ "vmla.u16 q1, q2, q15 \n" // 3*near+far (even)
+
+ "vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.16 {d0, d1, d2, d3}, [%1]! \n" // store
+ "subs %2, %2, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 1;
+ const uint16_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+ "vmov.u16 q15, #3 \n"
+
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // 01234567 (16b)
+ "vld1.16 {q1}, [%5]! \n" // 12345678 (16b)
+
+ "vmovq q2, q0 \n"
+ "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd)
+ "vmla.u16 q1, q2, q15 \n" // 3*near+far (even)
+
+ "vld1.16 {q2}, [%1]! \n" // 01234567 (16b)
+ "vld1.16 {q3}, [%6]! \n" // 12345678 (16b)
+
+ "vmovq q4, q2 \n"
+ "vmla.u16 q2, q3, q15 \n" // 3*near+far (odd)
+ "vmla.u16 q3, q4, q15 \n" // 3*near+far (even)
+
+ "vmovq q4, q2 \n"
+ "vmovq q5, q3 \n"
+ "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+ "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+ "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+ "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+ "vrshr.u16 q2, q1, #4 \n" // 2, even
+ "vrshr.u16 q3, q0, #4 \n" // 2, odd
+ "vrshr.u16 q0, q5, #4 \n" // 1, even
+ "vrshr.u16 q1, q4, #4 \n" // 1, odd
+
+ "vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store
+ "vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store
+ "subs %4, %4, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q15" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1;
+ asm volatile(
+ "vmov.u16 d31, #3 \n"
+
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // 01234567 (16b)
+ "vld1.16 {q1}, [%3]! \n" // 12345678 (16b)
+
+ "vmovl.u16 q2, d0 \n" // 0123 (32b)
+ "vmovl.u16 q3, d1 \n" // 4567 (32b)
+ "vmovl.u16 q4, d2 \n" // 1234 (32b)
+ "vmovl.u16 q5, d3 \n" // 5678 (32b)
+
+ "vmlal.u16 q2, d2, d31 \n"
+ "vmlal.u16 q3, d3, d31 \n"
+ "vmlal.u16 q4, d0, d31 \n"
+ "vmlal.u16 q5, d1, d31 \n"
+
+ "vrshrn.u32 d0, q4, #2 \n"
+ "vrshrn.u32 d1, q5, #2 \n"
+ "vrshrn.u32 d2, q2, #2 \n"
+ "vrshrn.u32 d3, q3, #2 \n"
+
+ "vst2.16 {q0, q1}, [%1]! \n" // store
+ "subs %2, %2, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q15" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 1;
+ const uint16_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+ "vmov.u16 d31, #3 \n"
+ "vmov.u32 q14, #3 \n"
+
+ "1: \n"
+ "vld1.16 {d0}, [%0]! \n" // 0123 (16b)
+ "vld1.16 {d1}, [%5]! \n" // 1234 (16b)
+ "vmovl.u16 q2, d0 \n" // 0123 (32b)
+ "vmovl.u16 q3, d1 \n" // 1234 (32b)
+ "vmlal.u16 q2, d1, d31 \n"
+ "vmlal.u16 q3, d0, d31 \n"
+
+ "vld1.16 {d0}, [%1]! \n" // 0123 (16b)
+ "vld1.16 {d1}, [%6]! \n" // 1234 (16b)
+ "vmovl.u16 q4, d0 \n" // 0123 (32b)
+ "vmovl.u16 q5, d1 \n" // 1234 (32b)
+ "vmlal.u16 q4, d1, d31 \n"
+ "vmlal.u16 q5, d0, d31 \n"
+
+ "vmovq q0, q4 \n"
+ "vmovq q1, q5 \n"
+ "vmla.u32 q4, q2, q14 \n"
+ "vmla.u32 q5, q3, q14 \n"
+ "vmla.u32 q2, q0, q14 \n"
+ "vmla.u32 q3, q1, q14 \n"
+
+ "vrshrn.u32 d1, q4, #4 \n"
+ "vrshrn.u32 d0, q5, #4 \n"
+ "vrshrn.u32 d3, q2, #4 \n"
+ "vrshrn.u32 d2, q3, #4 \n"
+
+ "vst2.16 {d0, d1}, [%2]! \n" // store
+ "vst2.16 {d2, d3}, [%3]! \n" // store
+ "subs %4, %4, #8 \n" // 4 sample -> 8 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
+ "d31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 2;
+ asm volatile(
+ "vmov.u8 d30, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
+ "vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v)
+
+ "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b)
+ "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b)
+ "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd)
+ "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even)
+
+ "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.16 {d0, d1}, [%1]! \n" // store
+ "subs %2, %2, #8 \n" // 4 uv -> 8 uv
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint8_t* src_temp = src_ptr + 2;
+ const uint8_t* src_temp1 = src_ptr1 + 2;
+
+ asm volatile(
+ "vmov.u16 q15, #3 \n"
+ "vmov.u8 d28, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
+ "vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v)
+
+ "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b)
+ "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b)
+ "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd)
+ "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even)
+
+ "vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v)
+ "vld1.8 {d9}, [%6]! \n" // 11223344 (1u1v)
+
+ "vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b)
+ "vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b)
+ "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd)
+ "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even)
+
+ // e o
+ // q1 q0
+ // q3 q2
+
+ "vmovq q4, q2 \n"
+ "vmovq q5, q3 \n"
+ "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+ "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+ "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+ "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+ // e o
+ // q5 q4
+ // q1 q0
+
+ "vrshrn.u16 d2, q1, #4 \n" // 2, even
+ "vrshrn.u16 d3, q0, #4 \n" // 2, odd
+ "vrshrn.u16 d0, q5, #4 \n" // 1, even
+ "vrshrn.u16 d1, q4, #4 \n" // 1, odd
+
+ "vst2.16 {d0, d1}, [%2]! \n" // store
+ "vst2.16 {d2, d3}, [%3]! \n" // store
+ "subs %4, %4, #8 \n" // 4 uv -> 8 uv
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
+ "q15" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 2;
+ asm volatile(
+ "vmov.u16 d30, #3 \n"
+
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16)
+ "vld1.16 {q1}, [%3]! \n" // 11223344 (1u1v, 16)
+
+ "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b)
+ "vmovl.u16 q3, d2 \n" // 1122 (1u1v, 32b)
+ "vmovl.u16 q4, d1 \n" // 2233 (1u1v, 32b)
+ "vmovl.u16 q5, d3 \n" // 3344 (1u1v, 32b)
+ "vmlal.u16 q2, d2, d30 \n" // 3*near+far (odd)
+ "vmlal.u16 q3, d0, d30 \n" // 3*near+far (even)
+ "vmlal.u16 q4, d3, d30 \n" // 3*near+far (odd)
+ "vmlal.u16 q5, d1, d30 \n" // 3*near+far (even)
+
+ "vrshrn.u32 d1, q2, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u32 d0, q3, #2 \n" // 3/4*near+1/4*far (even)
+ "vrshrn.u32 d3, q4, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u32 d2, q5, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.32 {d0, d1}, [%1]! \n" // store
+ "vst2.32 {d2, d3}, [%1]! \n" // store
+ "subs %2, %2, #8 \n" // 4 uv -> 8 uv
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
+ "d30" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 2;
+ const uint16_t* src_temp1 = src_ptr1 + 2;
+
+ asm volatile(
+ "vmov.u16 d30, #3 \n"
+ "vmov.u32 q14, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d0}, [%0]! \n" // 0011 (1u1v)
+ "vld1.8 {d1}, [%5]! \n" // 1122 (1u1v)
+ "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b)
+ "vmovl.u16 q3, d1 \n" // 1122 (1u1v, 32b)
+ "vmlal.u16 q2, d1, d30 \n" // 3*near+far (1, odd)
+ "vmlal.u16 q3, d0, d30 \n" // 3*near+far (1, even)
+
+ "vld1.8 {d0}, [%1]! \n" // 0011 (1u1v)
+ "vld1.8 {d1}, [%6]! \n" // 1122 (1u1v)
+ "vmovl.u16 q4, d0 \n" // 0011 (1u1v, 32b)
+ "vmovl.u16 q5, d1 \n" // 1122 (1u1v, 32b)
+ "vmlal.u16 q4, d1, d30 \n" // 3*near+far (2, odd)
+ "vmlal.u16 q5, d0, d30 \n" // 3*near+far (2, even)
+
+ "vmovq q0, q4 \n"
+ "vmovq q1, q5 \n"
+ "vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd)
+ "vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even)
+ "vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd)
+ "vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even)
+
+ "vrshrn.u32 d1, q4, #4 \n" // 1, odd
+ "vrshrn.u32 d0, q5, #4 \n" // 1, even
+ "vrshrn.u32 d3, q2, #4 \n" // 2, odd
+ "vrshrn.u32 d2, q3, #4 \n" // 2, even
+
+ "vst2.32 {d0, d1}, [%2]! \n" // store
+ "vst2.32 {d2, d3}, [%3]! \n" // store
+ "subs %4, %4, #4 \n" // 2 uv -> 4 uv
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
+ "d30" // Clobber List
+ );
+}
+
+// Add a row of bytes to a row of shorts. Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile(
+ "1: \n"
+ "vld1.16 {q1, q2}, [%1] \n" // load accumulator
+ "vld1.8 {q0}, [%0]! \n" // load 16 bytes
+ "vaddw.u8 q2, q2, d1 \n" // add
+ "vaddw.u8 q1, q1, d0 \n"
+ "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2" // Clobber List
+ );
+}
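+
+// A minimal scalar sketch of the accumulation above:
+// for (int i = 0; i < src_width; ++i) {
+//   dst_ptr[i] += src_ptr[i];
+// }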
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n"
+
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
+// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8_t* src_tmp = src_ptr;
+ asm volatile (
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q3, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "vadd.s32 q1, q1, q0 \n"
+ // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ "vadd.s32 q2, q1, q3 \n"
+ "vshl.i32 q0, q3, #1 \n" // 8 * dx
+ "1: \n"
+ LOAD2_DATA8_LANE(0)
+ LOAD2_DATA8_LANE(1)
+ LOAD2_DATA8_LANE(2)
+ LOAD2_DATA8_LANE(3)
+ LOAD2_DATA8_LANE(4)
+ LOAD2_DATA8_LANE(5)
+ LOAD2_DATA8_LANE(6)
+ LOAD2_DATA8_LANE(7)
+ "vmov q10, q1 \n"
+ "vmov q11, q2 \n"
+ "vuzp.16 q10, q11 \n"
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+ "vsubl.s16 q11, d18, d16 \n"
+ "vsubl.s16 q12, d19, d17 \n"
+ "vmovl.u16 q13, d20 \n"
+ "vmovl.u16 q10, d21 \n"
+ "vmul.s32 q11, q11, q13 \n"
+ "vmul.s32 q12, q12, q10 \n"
+ "vrshrn.s32 d18, q11, #16 \n"
+ "vrshrn.s32 d19, q12, #16 \n"
+ "vadd.s16 q8, q8, q9 \n"
+ "vmovn.s16 d6, q8 \n"
+
+ "vst1.8 {d6}, [%0]! \n" // store pixels
+ "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q2, q2, q0 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13"
+ );
+}
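+
+// A minimal scalar sketch of the 16.16 fixed-point stepping the lanes above
+// vectorize, in terms of the BLENDER formula quoted before the function:
+// for (int i = 0; i < dst_width; ++i) {
+//   int xi = x >> 16;
+//   dst_ptr[i] = BLENDER(src_ptr[xi], src_ptr[xi + 1], x & 0xffff);
+//   x += dx;
+// }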
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ "vst1.8 {d1[7]}, [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
+}
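+
+// A minimal scalar sketch of the general path above, with f =
+// source_y_fraction in (0, 256):
+// dst_ptr[i] =
+//     (src_ptr[i] * (256 - f) + src_ptr[i + src_stride] * f + 128) >> 8;
+// The 64/128/192 fractions are approximated with rounding half adds; two
+// vrhadds give roughly (a + 3 * b + 3) / 4 for the 25/75 blend.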
+
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vmov q2, q1 \n" // copy odd pixels for vst2
+ "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
+// 4a: 3e04 subs r6, #4
+// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]!
+// 50: ef64 21f4 vorr q9, q10, q10
+// 54: f942 038d vst2.32 {d16-d19}, [r2]!
+// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46>
+
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vrhadd.u8 q1, q2, q3 \n" // rounding half add
+ "vst2.32 {q0, q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+}
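+
+// A minimal scalar sketch, per channel (B shown; G, R and A are identical):
+// dst[i].b = (row0[2 * i].b + row0[2 * i + 1].b +
+//             row1[2 * i].b + row1[2 * i + 1].b + 2) >> 2;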
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "mov r12, %3, lsl #2 \n"
+ "1: \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stepx) // %3
+ : "memory", "cc", "r12", "q0");
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ asm volatile(
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"(src_stepx) // %4
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
+}
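+
+// A minimal scalar sketch: each output is the 2x2 box average of a block that
+// starts src_stepx pixels after the previous one (p is a hypothetical name):
+// const uint8_t* p = src_argb + i * src_stepx * 4;  // 4 bytes per ARGB pixel
+// dst[i].b = (p[0] + p[4] + p[src_stride] + p[src_stride + 4] + 2) >> 2;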
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(dn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "vld1.32 {" #dn "[" #n "]}, [%6] \n"
+
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ int tmp;
+ const uint8_t* src_tmp = src_argb;
+ asm volatile(
+ "1: \n"
+ // clang-format off
+ LOAD1_DATA32_LANE(d0, 0)
+ LOAD1_DATA32_LANE(d0, 1)
+ LOAD1_DATA32_LANE(d1, 0)
+ LOAD1_DATA32_LANE(d1, 1)
+ LOAD1_DATA32_LANE(d2, 0)
+ LOAD1_DATA32_LANE(d2, 1)
+ LOAD1_DATA32_LANE(d3, 0)
+ LOAD1_DATA32_LANE(d3, 1)
+ // clang-format on
+ "vst1.32 {q0, q1}, [%0]! \n" // store pixels
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "=&r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1");
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(dn1, dn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
+
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8_t* src_tmp = src_argb;
+ asm volatile (
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q9, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ "vmov.i8 q3, #0x7f \n" // 0x7F
+ "vmov.i16 q15, #0x7f \n" // 0x7F
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "vadd.s32 q8, q1, q0 \n"
+ "1: \n"
+ // d0, d1: a
+ // d2, d3: b
+ LOAD2_DATA32_LANE(d0, d2, 0)
+ LOAD2_DATA32_LANE(d0, d2, 1)
+ LOAD2_DATA32_LANE(d1, d3, 0)
+ LOAD2_DATA32_LANE(d1, d3, 1)
+ "vshrn.i32 d22, q8, #9 \n"
+ "vand.16 d22, d22, d30 \n"
+ "vdup.8 d24, d22[0] \n"
+ "vdup.8 d25, d22[2] \n"
+ "vdup.8 d26, d22[4] \n"
+ "vdup.8 d27, d22[6] \n"
+ "vext.8 d4, d24, d25, #4 \n"
+ "vext.8 d5, d26, d27, #4 \n" // f
+ "veor.8 q10, q2, q3 \n" // 0x7f ^ f
+ "vmull.u8 q11, d0, d20 \n"
+ "vmull.u8 q12, d1, d21 \n"
+ "vmull.u8 q13, d2, d4 \n"
+ "vmull.u8 q14, d3, d5 \n"
+ "vadd.i16 q11, q11, q13 \n"
+ "vadd.i16 q12, q12, q14 \n"
+ "vshrn.i16 d0, q11, #7 \n"
+ "vshrn.i16 d1, q12, #7 \n"
+
+ "vst1.32 {d0, d1}, [%0]! \n" // store pixels
+ "vadd.s32 q8, q8, q9 \n"
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
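+
+// A minimal scalar sketch of the 7-bit blend above: f = (x >> 9) & 0x7f keeps
+// the top 7 bits of the 16.16 fraction, and per channel
+// dst = (a * (0x7f ^ f) + b * f) >> 7;  // roughly a + (b - a) * f / 128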
+
+#undef LOAD2_DATA32_LANE
+
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
+ "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst1.16 {q1}, [%1]! \n" // store 8 UV
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1");
+}
+
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
+ "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vst1.16 {q0}, [%1]! \n" // store 8 UV
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1");
+}
+
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels.
+ "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts.
+ "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV
+ "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV
+ "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst2.8 {d0, d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q8", "q9");
+}
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld1.16 {d0[0]}, [%0], %6 \n"
+ "vld1.16 {d0[1]}, [%1], %6 \n"
+ "vld1.16 {d0[2]}, [%2], %6 \n"
+ "vld1.16 {d0[3]}, [%3], %6 \n"
+ "subs %5, %5, #4 \n" // 4 pixels per loop.
+ "vst1.8 {d0}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"(src_stepx * 8) // %6
+ : "memory", "cc", "d0");
+}
+
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
new file mode 100644
index 00000000..7c072380
--- /dev/null
+++ b/source/scale_neon64.cc
@@ -0,0 +1,1578 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 32x1, throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load even pixels into v0, odd into v1
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1" // Clobber List
+ );
+}
+
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load even pixels into v0, odd into v1
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1" // Clobber List
+ );
+}
+
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc");
+}
+
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "ld1 {v2.16b}, [%3], #16 \n"
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %w5, %w5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uadalp v0.8h, v3.16b \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(src_ptr2), // %3
+ "+r"(src_ptr3), // %4
+ "+r"(dst_width) // %5
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc");
+}
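+
+// A minimal scalar sketch: each output byte is the rounded mean of a 4x4
+// block, dst_ptr[i] = (sum of the 16 source bytes + 8) >> 4.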
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc");
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
+
+ // 3 * line_0 + line_1
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // (3 * line_0 + line_1 + 2) >> 2
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
+
+ // a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+
+ // a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
+
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19", "v20", "memory", "cc");
+}
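+
+// A minimal scalar sketch: t is the 3:1 vertical filter of the two rows, and
+// every 4 filtered pixels reduce to 3 with the a0/a1/a2 taps above:
+// t[j] = (3 * row0[j] + row1[j] + 2) >> 2;
+// dst[0] = (3 * t[0] + t[1] + 2) >> 2;
+// dst[1] = (t[1] + t[2] + 1) >> 1;
+// dst[2] = (t[2] + 3 * t[3] + 2) >> 2;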
+
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
+ // average src line 0 with src line 1
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
+
+ // a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
+
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
+}
+
+static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
+ 22, 24, 27, 30, 0, 0, 0, 0};
+static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
+ 34, 6, 22, 35, 0, 0, 0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18};
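+
+// Note the halved reciprocals: sqrdmulh doubles the product before taking the
+// high 16 bits, so 65536 / 12 divides by 6 and 65536 / 18 divides by 9:
+// (2 * x * (65536 / 12)) >> 16 ~= x / 6.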
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "v0", "v1", "v2", "v3", "memory", "cc");
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
+ ptrdiff_t tmp_src_stride = src_stride;
+
+ asm volatile(
+ "ld1 {v29.8h}, [%5] \n"
+ "ld1 {v30.16b}, [%6] \n"
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %w4, %w4, #12 \n"
+
+ // Shuffle the input data around to align it
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+
+ // combine source lines
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // combine source lines
+ "add v0.8h, v0.8h, v16.8h \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // 0+1+2, 3+4+5
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+
+ // Need to divide, but can't downshift as the divisor
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
+
+ // Align for table lookup, tbl requires registers to be adjacent
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(dst_width) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
+ "memory", "cc");
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ // TODO(fbarchard): use src_stride directly for clang 3.5+.
+ ptrdiff_t tmp_src_stride = src_stride;
+ asm volatile(
+ "ld1 {v30.8h}, [%4] \n"
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %w3, %w3, #12 \n"
+
+ // Shuffle the input data around to align it
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+
+ // combine source lines
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "uqrshrn v2.8b, v2.8h, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+
+ // combine source lines
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // 0+1+2, 3+4+5
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+
+ // Need to divide, but can't downshift as the divisor
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+
+ // Align for table lookup, tbl requires registers to
+ // be adjacent
+
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(dst_width) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19", "v30", "v31", "memory", "cc");
+}
+
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 1;
+ asm volatile(
+ "movi v31.8b, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n" // 01234567
+ "ldr d1, [%1], #8 \n" // 12345678
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
+ "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
+
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
+
+ "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.8b, v2.8b}, [%2], #16 \n" // store
+ "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint8_t* src_temp = src_ptr + 1;
+ const uint8_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+ "movi v31.8b, #3 \n"
+ "movi v30.8h, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n" // 01234567
+ "ldr d1, [%2], #8 \n" // 12345678
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
+ "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n"
+ "ldr d1, [%3], #8 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b)
+ "ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b)
+ "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
+ "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
+
+ "mov v0.16b, v4.16b \n"
+ "mov v1.16b, v5.16b \n"
+ "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
+ "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
+ "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
+ "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
+
+ "rshrn v2.8b, v2.8h, #4 \n" // 2, odd
+ "rshrn v1.8b, v3.8h, #4 \n" // 2, even
+ "rshrn v4.8b, v4.8h, #4 \n" // 1, odd
+ "rshrn v3.8b, v5.8h, #4 \n" // 1, even
+
+ "st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 1
+ "st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 2
+ "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1;
+ asm volatile(
+ "movi v31.8h, #3 \n"
+
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "mov v2.16b, v0.16b \n"
+ "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd)
+ "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even)
+
+ "urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd)
+ "urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.8h, v2.8h}, [%2], #32 \n" // store
+ "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 1;
+ const uint16_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+ "movi v31.8h, #3 \n"
+
+ "1: \n"
+ "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "mov v0.16b, v2.16b \n"
+ "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd)
+ "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even)
+
+ "ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b)
+ "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+ "mov v0.16b, v4.16b \n"
+ "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd)
+ "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even)
+
+ "mov v0.16b, v4.16b \n"
+ "mov v1.16b, v5.16b \n"
+ "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd)
+ "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even)
+ "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd)
+ "mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even)
+
+ "urshr v2.8h, v2.8h, #4 \n" // 2, odd
+ "urshr v1.8h, v3.8h, #4 \n" // 2, even
+ "urshr v4.8h, v4.8h, #4 \n" // 1, odd
+ "urshr v3.8h, v5.8h, #4 \n" // 1, even
+
+ "st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1
+ "st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2
+
+ "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1;
+ asm volatile(
+ "movi v31.8h, #3 \n"
+
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b)
+ "ushll2 v3.4s, v0.8h, #0 \n" // 4567 (32b)
+ "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b)
+ "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b)
+
+ "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
+ "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd)
+ "umlal v4.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)
+ "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even)
+
+ "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far
+ "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far (even)
+ "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far
+ "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd)
+
+ "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store
+ "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 1;
+ const uint16_t* src_temp1 = src_ptr1 + 1;
+
+ asm volatile(
+ "movi v31.4h, #3 \n"
+ "movi v30.4s, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n" // 0123 (16b)
+ "ldr d1, [%2], #8 \n" // 1234 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b)
+ "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b)
+ "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
+ "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n" // 0123 (16b)
+ "ldr d1, [%3], #8 \n" // 1234 (16b)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b)
+ "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b)
+ "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd)
+ "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even)
+
+ "mov v0.16b, v4.16b \n"
+ "mov v1.16b, v5.16b \n"
+ "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd)
+ "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even)
+ "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd)
+ "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even)
+
+ "rshrn v1.4h, v4.4s, #4 \n" // 3/4*near+1/4*far
+ "rshrn v0.4h, v5.4s, #4 \n" // 3/4*near+1/4*far
+ "rshrn v5.4h, v2.4s, #4 \n" // 3/4*near+1/4*far
+ "rshrn v4.4h, v3.4s, #4 \n" // 3/4*near+1/4*far
+
+ "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1
+ "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2
+
+ "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 2;
+ asm volatile(
+ "movi v31.8b, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n" // 00112233 (1u1v)
+ "ldr d1, [%1], #8 \n" // 11223344 (1u1v)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b)
+ "ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b)
+
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
+
+ "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.4h, v2.4h}, [%2], #16 \n" // store
+ "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint8_t* src_temp = src_ptr + 2;
+ const uint8_t* src_temp1 = src_ptr1 + 2;
+
+ asm volatile(
+ "movi v31.8b, #3 \n"
+ "movi v30.8h, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n"
+ "ldr d1, [%2], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n"
+ "ushll v3.8h, v1.8b, #0 \n"
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n"
+ "ldr d1, [%3], #8 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v4.8h, v0.8b, #0 \n"
+ "ushll v5.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
+ "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
+
+ "mov v0.16b, v4.16b \n"
+ "mov v1.16b, v5.16b \n"
+ "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
+ "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
+ "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
+ "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
+
+ "rshrn v2.8b, v2.8h, #4 \n" // 2, odd
+ "rshrn v1.8b, v3.8h, #4 \n" // 2, even
+ "rshrn v4.8b, v4.8h, #4 \n" // 1, odd
+ "rshrn v3.8b, v5.8h, #4 \n" // 1, even
+
+ "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2
+ "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1
+ "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 2;
+ asm volatile(
+ "movi v31.8h, #3 \n"
+
+ "1: \n"
+ "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
+ "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
+ "ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b)
+ "ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b)
+
+ "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd)
+ "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even)
+ "umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd)
+ "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even)
+
+ "rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even)
+ "rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.2s, v2.2s}, [%2], #16 \n" // store
+ "st2 {v3.2s, v4.2s}, [%2], #16 \n" // store
+ "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint16_t* src_temp = src_ptr + 2;
+ const uint16_t* src_temp1 = src_ptr1 + 2;
+
+ asm volatile(
+ "movi v31.4h, #3 \n"
+ "movi v30.4s, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n"
+ "ldr d1, [%2], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
+ "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
+ "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
+ "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n"
+ "ldr d1, [%3], #8 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "ushll v4.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
+ "ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
+ "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd)
+ "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even)
+
+ "mov v0.16b, v4.16b \n"
+ "mov v1.16b, v5.16b \n"
+ "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd)
+ "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even)
+ "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd)
+ "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even)
+
+ "rshrn v1.4h, v2.4s, #4 \n" // 2, odd
+ "rshrn v0.4h, v3.4s, #4 \n" // 2, even
+ "rshrn v3.4h, v4.4s, #4 \n" // 1, odd
+ "rshrn v2.4h, v5.4s, #4 \n" // 1, even
+
+ "st2 {v0.2s, v1.2s}, [%5], #16 \n" // store 2
+ "st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1
+ "subs %w6, %w6, #4 \n" // 2 uv -> 4 uv
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
+// Add a row of bytes to a row of shorts. Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
+ "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v1.8h, v1.8h, v0.8b \n"
+ "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ "ld2 {v4.b, v5.b}[" #n "], [%6] \n"
+
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
+// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8_t* src_tmp = src_ptr;
+ int64_t x64 = (int64_t)x; // NOLINT
+ int64_t dx64 = (int64_t)dx; // NOLINT
+ asm volatile (
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "add v1.4s, v1.4s, v0.4s \n"
+ // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ "add v2.4s, v1.4s, v3.4s \n"
+ "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+ "1: \n"
+ LOAD2_DATA8_LANE(0)
+ LOAD2_DATA8_LANE(1)
+ LOAD2_DATA8_LANE(2)
+ LOAD2_DATA8_LANE(3)
+ LOAD2_DATA8_LANE(4)
+ LOAD2_DATA8_LANE(5)
+ LOAD2_DATA8_LANE(6)
+ LOAD2_DATA8_LANE(7)
+ "mov v6.16b, v1.16b \n"
+ "mov v7.16b, v2.16b \n"
+ "uzp1 v6.8h, v6.8h, v7.8h \n"
+ "ushll v4.8h, v4.8b, #0 \n"
+ "ushll v5.8h, v5.8b, #0 \n"
+ "ssubl v16.4s, v5.4h, v4.4h \n"
+ "ssubl2 v17.4s, v5.8h, v4.8h \n"
+ "ushll v7.4s, v6.4h, #0 \n"
+ "ushll2 v6.4s, v6.8h, #0 \n"
+ "mul v16.4s, v16.4s, v7.4s \n"
+ "mul v17.4s, v17.4s, v6.4s \n"
+ "rshrn v6.4h, v16.4s, #16 \n"
+ "rshrn2 v6.8h, v17.4s, #16 \n"
+ "add v4.8h, v4.8h, v6.8h \n"
+ "xtn v4.8b, v4.8h \n"
+
+ "st1 {v4.8b}, [%0], #8 \n" // store pixels
+ "add v1.4s, v1.4s, v0.4s \n"
+ "add v2.4s, v2.4s, v0.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3",
+ "v4", "v5", "v6", "v7", "v16", "v17"
+ );
+}
+
+#undef LOAD2_DATA8_LANE
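+
+// For reference, a scalar sketch of the loop ScaleFilterCols_NEON vectorizes
+// (hypothetical helper; the production scalar version is ScaleFilterCols_C):
+static void ScaleFilterCols_Sketch(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int j;
+ for (j = 0; j < dst_width; ++j) {
+ int xi = x >> 16; // integer source position
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ // BLENDER: blend a toward b by the 16-bit fraction of x, with rounding.
+ dst_ptr[j] = (uint8_t)(a + ((((x & 0xffff) * (b - a)) + 0x8000) >> 16));
+ x += dx;
+ }
+}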
+
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load 16 ARGB pixels with even pixels into v0/v2, odd into v1/v3
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "mov v2.16b, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load 16 ARGB pixels with even pixels into v0/v2, odd into v1/v3
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "urhadd v1.16b, v2.16b, v3.16b \n"
+ "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+}
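+
+// Per channel, the box filter above computes for each output pixel:
+// dst = (p00 + p01 + p10 + p11 + 2) >> 2
+// where p00/p01 are horizontally adjacent pixels in row 0 (summed by
+// uaddlp), p10/p11 the pixels below them (accumulated by uadalp), and the
+// rounding add of 2 comes from rshrn #2.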
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb must be 4-byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.s}[0], [%0], %3 \n"
+ "ld1 {v0.s}[1], [%0], %3 \n"
+ "ld1 {v0.s}[2], [%0], %3 \n"
+ "ld1 {v0.s}[3], [%0], %3 \n"
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((int64_t)(src_stepx * 4)) // %3
+ : "memory", "cc", "v0");
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb must be 4-byte aligned.
+// TODO(Yang Zhang): Might be worth another optimization pass in the future.
+// It could be upgraded to process 8 pixels at a time to start with.
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ asm volatile(
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
+ "ld1 {v1.8b}, [%1], %4 \n"
+ "ld1 {v2.8b}, [%0], %4 \n"
+ "ld1 {v3.8b}, [%1], %4 \n"
+ "ld1 {v4.8b}, [%0], %4 \n"
+ "ld1 {v5.8b}, [%1], %4 \n"
+ "ld1 {v6.8b}, [%0], %4 \n"
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+ "subs %w3, %w3, #4 \n" // 4 pixels per loop.
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"((int64_t)(src_stepx * 4)) // %4
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+// TODO(Yang Zhang): Investigate using fewer load instructions for
+// the x/dx stepping.
+#define LOAD1_DATA32_LANE(vn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "ld1 {" #vn ".s}[" #n "], [%6] \n"
+
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint8_t* src_tmp = src_argb;
+ int64_t x64 = (int64_t)x; // NOLINT
+ int64_t dx64 = (int64_t)dx; // NOLINT
+ int64_t tmp64;
+ asm volatile(
+ "1: \n"
+ // clang-format off
+ LOAD1_DATA32_LANE(v0, 0)
+ LOAD1_DATA32_LANE(v0, 1)
+ LOAD1_DATA32_LANE(v0, 2)
+ LOAD1_DATA32_LANE(v0, 3)
+ LOAD1_DATA32_LANE(v1, 0)
+ LOAD1_DATA32_LANE(v1, 1)
+ LOAD1_DATA32_LANE(v1, 2)
+ LOAD1_DATA32_LANE(v1, 3)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ // clang-format on
+ "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "=&r"(tmp64), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1");
+}
+
+#undef LOAD1_DATA32_LANE
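+
+// The macro above implements one lane of this scalar gather (dst/src viewed
+// as uint32_t ARGB pixels):
+// for (int j = 0; j < dst_width; ++j) {
+// dst[j] = src[x >> 16]; // x is a 16.16 fixed-point position
+// x += dx;
+// }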
+
+// TODO(Yang Zhang): Investigate using fewer load instructions for
+// the x/dx stepping.
+#define LOAD2_DATA32_LANE(vn1, vn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
+
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8_t* src_tmp = src_argb;
+ int64_t x64 = (int64_t)x; // NOLINT
+ int64_t dx64 = (int64_t)dx; // NOLINT
+ asm volatile (
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ "movi v3.16b, #0x7f \n" // 0x7F
+ "movi v4.8h, #0x7f \n" // 0x7F
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "add v5.4s, v1.4s, v0.4s \n"
+ "1: \n"
+ // d0, d1: a
+ // d2, d3: b
+ LOAD2_DATA32_LANE(v0, v1, 0)
+ LOAD2_DATA32_LANE(v0, v1, 1)
+ LOAD2_DATA32_LANE(v0, v1, 2)
+ LOAD2_DATA32_LANE(v0, v1, 3)
+ "shrn v2.4h, v5.4s, #9 \n"
+ "and v2.8b, v2.8b, v4.8b \n"
+ "dup v16.8b, v2.b[0] \n"
+ "dup v17.8b, v2.b[2] \n"
+ "dup v18.8b, v2.b[4] \n"
+ "dup v19.8b, v2.b[6] \n"
+ "ext v2.8b, v16.8b, v17.8b, #4 \n"
+ "ext v17.8b, v18.8b, v19.8b, #4 \n"
+ "ins v2.d[1], v17.d[0] \n" // f
+ "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
+ "umull v16.8h, v0.8b, v7.8b \n"
+ "umull2 v17.8h, v0.16b, v7.16b \n"
+ "umull v18.8h, v1.8b, v2.8b \n"
+ "umull2 v19.8h, v1.16b, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "add v16.8h, v16.8h, v18.8h \n"
+ "add v17.8h, v17.8h, v19.8h \n"
+ "shrn v0.8b, v16.8h, #7 \n"
+ "shrn2 v0.16b, v17.8h, #7 \n"
+ "st1 {v0.4s}, [%0], #16 \n" // store pixels
+ "add v5.4s, v5.4s, v6.4s \n"
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v16", "v17", "v18", "v19"
+ );
+}
+
+#undef LOAD2_DATA32_LANE
+
+// Read 16x2 pixels, average down, and write 8x1.
+void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
+ "1: \n"
+ "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #8 \n" // 8 processed per loop
+ "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
+ "uaddlp v1.4s, v1.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
+ "uadalp v1.4s, v3.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.4h, v0.4s, #2 \n" // round and pack
+ "rshrn2 v0.8h, v1.4s, #2 \n"
+ "st1 {v0.8h}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// Read 8x2 pixels, upsample with filtering, and write 16x1.
+// Actually reads an extra pixel, so effectively 9x2.
+void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ asm volatile(
+ "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
+ "movi v0.8h, #9 \n" // constants
+ "movi v1.4s, #3 \n"
+
+ "1: \n"
+ "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
+ "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
+ "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
+ "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
+ "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
+ "umull v16.4s, v3.4h, v0.4h \n"
+ "umull2 v7.4s, v3.8h, v0.8h \n"
+ "umull v18.4s, v4.4h, v0.4h \n"
+ "umull2 v17.4s, v4.8h, v0.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v16.4s, v16.4s, v6.4h \n"
+ "uaddl2 v19.4s, v6.8h, v3.8h \n"
+ "uaddl v3.4s, v6.4h, v3.4h \n"
+ "uaddw2 v6.4s, v7.4s, v6.8h \n"
+ "uaddl2 v7.4s, v5.8h, v4.8h \n"
+ "uaddl v4.4s, v5.4h, v4.4h \n"
+ "uaddw v18.4s, v18.4s, v5.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "mla v16.4s, v4.4s, v1.4s \n"
+ "mla v18.4s, v3.4s, v1.4s \n"
+ "mla v6.4s, v7.4s, v1.4s \n"
+ "uaddw2 v4.4s, v17.4s, v5.8h \n"
+ "uqrshrn v16.4h, v16.4s, #4 \n"
+ "mla v4.4s, v19.4s, v1.4s \n"
+ "uqrshrn2 v16.8h, v6.4s, #4 \n"
+ "uqrshrn v17.4h, v18.4s, #4 \n"
+ "uqrshrn2 v17.8h, v4.4s, #4 \n"
+ "st2 {v16.8h-v17.8h}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ : "r"(2LL), // %4
+ "r"(14LL) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19" // Clobber List
+ );
+}
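+
+// The 9/3/3/1 weights above are 2x bilinear upsampling: each output pixel
+// lies 1/4 of the way from its nearest source pixel in each axis, so per
+// output (a worked form of the arithmetic above):
+// dst = (9 * nearest + 3 * horizontal + 3 * vertical + diagonal + 8) >> 4
+// with the +8 providing rounding for the >> 4 (uqrshrn).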
+
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v1.8h}, [%1], #16 \n" // store 8 UV
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1");
+}
+
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.8h}, [%1], #16 \n" // store 8 UV
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1");
+}
+
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts.
+ "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16
+ "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v16", "v17");
+}
+
+// Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.h}[0], [%0], %6 \n"
+ "ld1 {v1.h}[0], [%1], %6 \n"
+ "ld1 {v2.h}[0], [%2], %6 \n"
+ "ld1 {v3.h}[0], [%3], %6 \n"
+ "subs %w5, %w5, #4 \n" // 4 pixels per loop.
+ "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"((int64_t)(src_stepx * 8)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3");
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/scale_rgb.cc b/source/scale_rgb.cc
new file mode 100644
index 00000000..8db59b56
--- /dev/null
+++ b/source/scale_rgb.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h" /* For FilterMode */
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_argb.h"
+#include "libyuv/scale_rgb.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Scale a 24-bit RGB image.
+// Converts to ARGB as an intermediate step.
+
+LIBYUV_API
+int RGBScale(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ int src_width,
+ int src_height,
+ uint8_t* dst_rgb,
+ int dst_stride_rgb,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int r;
+ // Allocate one block holding both intermediate ARGB images.
+ uint8_t* src_argb =
+ (uint8_t*)malloc(src_width * src_height * 4 + dst_width * dst_height * 4);
+ uint8_t* dst_argb;
+
+ if (!src_argb) {
+ return 1;
+ }
+ dst_argb = src_argb + src_width * src_height * 4;
+
+ r = RGB24ToARGB(src_rgb, src_stride_rgb, src_argb, src_width * 4, src_width,
+ src_height);
+ if (!r) {
+ r = ARGBScale(src_argb, src_width * 4, src_width, src_height, dst_argb,
+ dst_width * 4, dst_width, dst_height, filtering);
+ if (!r) {
+ r = ARGBToRGB24(dst_argb, dst_width * 4, dst_rgb, dst_stride_rgb,
+ dst_width, dst_height);
+ }
+ }
+ free(src_argb);
+ return r;
+}
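+
+// A minimal usage sketch of RGBScale; the dimensions below are illustrative
+// assumptions, not requirements of the API:
+// uint8_t src[360 * 640 * 3]; // RGB24 input, stride = 640 * 3 bytes
+// uint8_t dst[180 * 320 * 3]; // RGB24 output, stride = 320 * 3 bytes
+// int r = RGBScale(src, 640 * 3, 640, 360,
+// dst, 320 * 3, 320, 180, kFilterBilinear);
+// // r is 0 on success, non-zero on failure (e.g. allocation).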
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc
new file mode 100644
index 00000000..de037e45
--- /dev/null
+++ b/source/scale_rvv.cc
@@ -0,0 +1,1040 @@
+/*
+ * Copyright 2023 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * Contributed by Darren Hsieh <darren.hsieh@sifive.com>
+ * Contributed by Bruce Lai <bruce.lai@sifive.com>
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+// This module is for clang RVV. GCC does not yet support segment load & store.
+#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
+ defined(__clang__)
+#include <assert.h>
+#include <riscv_vector.h>
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAS_SCALEADDROW_RVV
+void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+ size_t w = (size_t)src_width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ vuint8m4_t v_src = __riscv_vle8_v_u8m4(src_ptr, vl);
+ vuint16m8_t v_dst = __riscv_vle16_v_u16m8(dst_ptr, vl);
+ // Use widening multiply-add instead of widening + add
+ v_dst = __riscv_vwmaccu_vx_u16m8(v_dst, 1, v_src, vl);
+ __riscv_vse16_v_u16m8(dst_ptr, v_dst, vl);
+ w -= vl;
+ src_ptr += vl;
+ dst_ptr += vl;
+ } while (w > 0);
+}
+#endif
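+
+// All RVV loops in this file use the same strip-mining pattern: vsetvl
+// returns how many elements the hardware will process this iteration (at
+// most the requested w), the body handles exactly vl elements, and the
+// pointers and remaining width advance by vl until the row is consumed.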
+
+#ifdef HAS_SCALEARGBROWDOWN2_RVV
+void ScaleARGBRowDown2_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ size_t w = (size_t)dst_width;
+ const uint64_t* src = (const uint64_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
+ do {
+ size_t vl = __riscv_vsetvl_e64m8(w);
+ vuint64m8_t v_data = __riscv_vle64_v_u64m8(src, vl);
+ vuint32m4_t v_dst = __riscv_vnsrl_wx_u32m4(v_data, 32, vl);
+ __riscv_vse32_v_u32m4(dst, v_dst, vl);
+ w -= vl;
+ src += vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2LINEAR_RVV
+void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ size_t w = (size_t)dst_width;
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m4_t v_odd, v_even, v_dst;
+ vuint32m4_t v_odd_32, v_even_32;
+ size_t vl = __riscv_vsetvl_e32m4(w);
+ __riscv_vlseg2e32_v_u32m4(&v_even_32, &v_odd_32, src, vl);
+ v_even = __riscv_vreinterpret_v_u32m4_u8m4(v_even_32);
+ v_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_odd_32);
+ // Use round-to-nearest-up mode for averaging add
+ v_dst = __riscv_vaaddu_vv_u8m4(v_even, v_odd, vl * 4);
+ __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
+ w -= vl;
+ src += vl * 2;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2BOX_RVV
+void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const uint32_t* src0 = (const uint32_t*)(src_argb);
+ const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst;
+ vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16;
+ vuint32m4_t v_row0_odd_32, v_row0_even_32, v_row1_odd_32, v_row1_even_32;
+ size_t vl = __riscv_vsetvl_e32m4(w);
+ __riscv_vlseg2e32_v_u32m4(&v_row0_even_32, &v_row0_odd_32, src0, vl);
+ __riscv_vlseg2e32_v_u32m4(&v_row1_even_32, &v_row1_odd_32, src1, vl);
+ v_row0_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_even_32);
+ v_row0_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_odd_32);
+ v_row1_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_even_32);
+ v_row1_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_odd_32);
+ v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_even, v_row0_odd, vl * 4);
+ v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_even, v_row1_odd, vl * 4);
+ v_dst_16 = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4);
+ // Use round-to-nearest-up mode for vnclip
+ v_dst = __riscv_vnclipu_wx_u8m4(v_dst_16, 2, vl * 4);
+ __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
+ w -= vl;
+ src0 += vl * 2;
+ src1 += vl * 2;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_RVV
+void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
+ const int stride_byte = src_stepx * 4;
+ do {
+ size_t vl = __riscv_vsetvl_e32m8(w);
+ vuint32m8_t v_row = __riscv_vlse32_v_u32m8(src, stride_byte, vl);
+ __riscv_vse32_v_u32m8(dst, v_row, vl);
+ w -= vl;
+ src += vl * src_stepx;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWNEVENBOX_RVV
+void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const uint32_t* src0 = (const uint32_t*)(src_argb);
+ const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
+ const int stride_byte = src_stepx * 4;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst;
+ vuint16m8_t v_row0_sum, v_row1_sum, v_sum;
+ vuint32m4_t v_row0_low_32, v_row0_high_32, v_row1_low_32, v_row1_high_32;
+ size_t vl = __riscv_vsetvl_e32m4(w);
+ __riscv_vlsseg2e32_v_u32m4(&v_row0_low_32, &v_row0_high_32, src0,
+ stride_byte, vl);
+ __riscv_vlsseg2e32_v_u32m4(&v_row1_low_32, &v_row1_high_32, src1,
+ stride_byte, vl);
+ v_row0_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_low_32);
+ v_row0_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_high_32);
+ v_row1_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_low_32);
+ v_row1_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_high_32);
+ v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_low, v_row0_high, vl * 4);
+ v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_low, v_row1_high, vl * 4);
+ v_sum = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4);
+ // Use round-to-nearest-up mode for vnclip
+ v_dst = __riscv_vnclipu_wx_u8m4(v_sum, 2, vl * 4);
+ __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
+ w -= vl;
+ src0 += vl * src_stepx;
+ src1 += vl * src_stepx;
+ dst_argb += vl * 4;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN2_RVV
+void ScaleRowDown2_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const uint16_t* src = (const uint16_t*)src_ptr;
+ (void)src_stride;
+ do {
+ size_t vl = __riscv_vsetvl_e16m8(w);
+ vuint16m8_t v_src = __riscv_vle16_v_u16m8(src, vl);
+ vuint8m4_t v_dst = __riscv_vnsrl_wx_u8m4(v_src, 8, vl);
+ __riscv_vse8_v_u8m4(dst, v_dst, vl);
+ w -= vl;
+ src += vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN2LINEAR_RVV
+void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ (void)src_stride;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m4_t v_s0, v_s1, v_dst;
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, src_ptr, vl);
+ // Use round-to-nearest-up mode for averaging add
+ v_dst = __riscv_vaaddu_vv_u8m4(v_s0, v_s1, vl);
+ __riscv_vse8_v_u8m4(dst, v_dst, vl);
+ w -= vl;
+ src_ptr += 2 * vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN2BOX_RVV
+void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ size_t w = (size_t)dst_width;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ vuint8m4_t v_s0, v_s1, v_t0, v_t1;
+ vuint16m8_t v_s01, v_t01, v_st01;
+ vuint8m4_t v_dst;
+ __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, s, vl);
+ __riscv_vlseg2e8_v_u8m4(&v_t0, &v_t1, t, vl);
+ v_s01 = __riscv_vwaddu_vv_u16m8(v_s0, v_s1, vl);
+ v_t01 = __riscv_vwaddu_vv_u16m8(v_t0, v_t1, vl);
+ v_st01 = __riscv_vadd_vv_u16m8(v_s01, v_t01, vl);
+ // Use round-to-nearest-up mode for vnclip
+ v_dst = __riscv_vnclipu_wx_u8m4(v_st01, 2, vl);
+ __riscv_vse8_v_u8m4(dst, v_dst, vl);
+ w -= vl;
+ s += 2 * vl;
+ t += 2 * vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN4_RVV
+void ScaleRowDown4_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ (void)src_stride;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+ __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
+ __riscv_vse8_v_u8m2(dst_ptr, v_s2, vl);
+ w -= vl;
+ src_ptr += (4 * vl);
+ dst_ptr += vl;
+ } while (w > 0);
+}
+#endif
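+
+// vlseg4e8 deinterleaves each group of 4 source bytes into 4 vectors, so
+// storing v_s2 keeps pixel 2 of every group; scalar equivalent:
+// for (int i = 0; i < dst_width; ++i) dst_ptr[i] = src_ptr[4 * i + 2];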
+
+#ifdef HAS_SCALEROWDOWN4BOX_RVV
+void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+ size_t w = (size_t)dst_width;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+ vuint8m2_t v_t0, v_t1, v_t2, v_t3;
+ vuint8m2_t v_u0, v_u1, v_u2, v_u3;
+ vuint8m2_t v_v0, v_v1, v_v2, v_v3;
+ vuint16m4_t v_s01, v_s23, v_t01, v_t23;
+ vuint16m4_t v_u01, v_u23, v_v01, v_v23;
+ vuint16m4_t v_st01, v_st23, v_uv01, v_uv23;
+ vuint16m4_t v_st0123, v_uv0123, v_stuv0123;
+ vuint8m2_t v_dst;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+
+ __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
+ v_s01 = __riscv_vwaddu_vv_u16m4(v_s0, v_s1, vl);
+
+ __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, src_ptr1, vl);
+ v_t01 = __riscv_vwaddu_vv_u16m4(v_t0, v_t1, vl);
+
+ __riscv_vlseg4e8_v_u8m2(&v_u0, &v_u1, &v_u2, &v_u3, src_ptr2, vl);
+ v_u01 = __riscv_vwaddu_vv_u16m4(v_u0, v_u1, vl);
+ v_u23 = __riscv_vwaddu_vv_u16m4(v_u2, v_u3, vl);
+
+ v_s23 = __riscv_vwaddu_vv_u16m4(v_s2, v_s3, vl);
+ v_t23 = __riscv_vwaddu_vv_u16m4(v_t2, v_t3, vl);
+ v_st01 = __riscv_vadd_vv_u16m4(v_s01, v_t01, vl);
+ v_st23 = __riscv_vadd_vv_u16m4(v_s23, v_t23, vl);
+
+ __riscv_vlseg4e8_v_u8m2(&v_v0, &v_v1, &v_v2, &v_v3, src_ptr3, vl);
+
+ v_v01 = __riscv_vwaddu_vv_u16m4(v_v0, v_v1, vl);
+ v_v23 = __riscv_vwaddu_vv_u16m4(v_v2, v_v3, vl);
+
+ v_uv01 = __riscv_vadd_vv_u16m4(v_u01, v_v01, vl);
+ v_uv23 = __riscv_vadd_vv_u16m4(v_u23, v_v23, vl);
+
+ v_st0123 = __riscv_vadd_vv_u16m4(v_st01, v_st23, vl);
+ v_uv0123 = __riscv_vadd_vv_u16m4(v_uv01, v_uv23, vl);
+ v_stuv0123 = __riscv_vadd_vv_u16m4(v_st0123, v_uv0123, vl);
+ // Use round-to-nearest-up mode for vnclip
+ v_dst = __riscv_vnclipu_wx_u8m2(v_stuv0123, 4, vl);
+ __riscv_vse8_v_u8m2(dst_ptr, v_dst, vl);
+ w -= vl;
+ src_ptr += 4 * vl;
+ src_ptr1 += 4 * vl;
+ src_ptr2 += 4 * vl;
+ src_ptr3 += 4 * vl;
+ dst_ptr += vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN34_RVV
+void ScaleRowDown34_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ do {
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+ __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
+ __riscv_vsseg3e8_v_u8m2(dst_ptr, v_s0, v_s1, v_s3, vl);
+ w -= vl;
+ src_ptr += 4 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+#endif
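+
+// The unfiltered 3/4 kernel keeps pixels 0, 1 and 3 of every 4; scalar
+// equivalent:
+// dst[3 * i + 0] = src[4 * i + 0];
+// dst[3 * i + 1] = src[4 * i + 1];
+// dst[3 * i + 2] = src[4 * i + 3];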
+
+#ifdef HAS_SCALEROWDOWN34_0_BOX_RVV
+void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+ vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
+ vuint8m2_t v_u0, v_u1, v_u2, v_u3;
+ vuint16m4_t v_u1_u16;
+ vuint8m2_t v_a0, v_a1, v_a2;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl);
+
+ if (src_stride == 0) {
+ v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+ v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+ v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_s2, 2, vl);
+ v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_s3, 2, vl);
+ } else {
+ vuint8m2_t v_t0, v_t1, v_t2, v_t3;
+ __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl);
+ v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 0, vl);
+ v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 0, vl);
+ v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_t2, 0, vl);
+ v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_t3, 0, vl);
+ t += 4 * vl;
+ }
+
+ v_t0_u16 = __riscv_vwmaccu_vx_u16m4(v_t0_u16, 3, v_s0, vl);
+ v_t1_u16 = __riscv_vwmaccu_vx_u16m4(v_t1_u16, 3, v_s1, vl);
+ v_t2_u16 = __riscv_vwmaccu_vx_u16m4(v_t2_u16, 3, v_s2, vl);
+ v_t3_u16 = __riscv_vwmaccu_vx_u16m4(v_t3_u16, 3, v_s3, vl);
+
+ // Use round-to-nearest-up mode for vnclip & averaging add
+ v_u0 = __riscv_vnclipu_wx_u8m2(v_t0_u16, 2, vl);
+ v_u1 = __riscv_vnclipu_wx_u8m2(v_t1_u16, 2, vl);
+ v_u2 = __riscv_vnclipu_wx_u8m2(v_t2_u16, 2, vl);
+ v_u3 = __riscv_vnclipu_wx_u8m2(v_t3_u16, 2, vl);
+
+ // a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2
+ v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u1, 0, vl);
+ v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u0, vl);
+ v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+ // a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1
+ v_a1 = __riscv_vaaddu_vv_u8m2(v_u1, v_u2, vl);
+
+ // a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2
+ v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u2, 0, vl);
+ v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u3, vl);
+ v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+ __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl);
+
+ w -= vl;
+ s += 4 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN34_1_BOX_RVV
+void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+ vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
+ vuint16m4_t v_u1_u16;
+ vuint8m2_t v_a0, v_a1, v_a2;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+ __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl);
+
+ // Use round-to-nearest-up mode for vnclip & averaging add
+ if (src_stride == 0) {
+ v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_s0, vl);
+ v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_s1, vl);
+ v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_s2, vl);
+ v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_s3, vl);
+ } else {
+ vuint8m2_t v_t0, v_t1, v_t2, v_t3;
+ __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl);
+ v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_t0, vl);
+ v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_t1, vl);
+ v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_t2, vl);
+ v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_t3, vl);
+ t += 4 * vl;
+ }
+ // a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2
+ v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave1, 0, vl);
+ v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave0, vl);
+ v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+ // a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1
+ v_a1 = __riscv_vaaddu_vv_u8m2(v_ave1, v_ave2, vl);
+
+ // a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2
+ v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave2, 0, vl);
+ v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave3, vl);
+ v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+ __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl);
+
+ w -= vl;
+ s += 4 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN38_RVV
+void ScaleRowDown38_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ (void)src_stride;
+ assert(dst_width % 3 == 0);
+ do {
+ vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+ size_t vl = __riscv_vsetvl_e8m1(w);
+ __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+ &v_s7, src_ptr, vl);
+ __riscv_vsseg3e8_v_u8m1(dst_ptr, v_s0, v_s3, v_s6, vl);
+ w -= vl;
+ src_ptr += 8 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEROWDOWN38_2_BOX_RVV
+void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ const uint16_t coeff_a = (65536u / 6u);
+ const uint16_t coeff_b = (65536u / 4u);
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ do {
+ vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+ vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
+ vuint16m2_t v_e0, v_e1, v_e2, v_e;
+ vuint16m2_t v_f0, v_f1, v_f2, v_f;
+ vuint16m2_t v_g0, v_g1, v_g;
+ vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
+ size_t vl = __riscv_vsetvl_e8m1(w);
+ // s: e00, e10, e20, f00, f10, f20, g00, g10
+ // t: e01, e11, e21, f01, f11, f21, g01, g11
+ __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+ &v_s7, src_ptr, vl);
+ __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
+ &v_t7, src_ptr + src_stride, vl);
+ // Calculate sum of [e00, e21] to v_e
+ // Calculate sum of [f00, f21] to v_f
+ // Calculate sum of [g00, g11] to v_g
+ v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
+ v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
+ v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
+ v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
+ v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
+ v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
+ v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
+ v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
+
+ v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
+ v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
+ v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
+ v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
+ v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
+
+ // Average in 16-bit fixed-point
+ v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
+ v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
+ v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
+
+ v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
+ v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
+ v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
+
+ __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
+ w -= vl;
+ src_ptr += 8 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+#endif
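+
+// coeff_a and coeff_b above implement division by the box area with a
+// multiply-high: vmulhu returns (sum * coeff) >> 16, and with
+// coeff = 65536 / 6 that approximates sum / 6 (truncating) for the sums of
+// at most 6 * 255 produced here. Worked example for one 'e' output:
+// uint16_t sum = e00 + e01 + e10 + e11 + e20 + e21; // 3x2 box, <= 1530
+// uint8_t dst_e = (uint8_t)(((uint32_t)sum * (65536u / 6u)) >> 16);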
+
+#ifdef HAS_SCALEROWDOWN38_3_BOX_RVV
+void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t w = (size_t)dst_width / 3u;
+ const uint16_t coeff_a = (65536u / 9u);
+ const uint16_t coeff_b = (65536u / 6u);
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ do {
+ vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
+ vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
+ vuint8m1_t v_u0, v_u1, v_u2, v_u3, v_u4, v_u5, v_u6, v_u7;
+ vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e;
+ vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f;
+ vuint16m2_t v_g0, v_g1, v_g2, v_g;
+ vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
+ size_t vl = __riscv_vsetvl_e8m1(w);
+ // s: e00, e10, e20, f00, f10, f20, g00, g10
+ // t: e01, e11, e21, f01, f11, f21, g01, g11
+ // u: e02, e12, e22, f02, f12, f22, g02, g12
+ __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
+ &v_s7, src_ptr, vl);
+ __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
+ &v_t7, src_ptr + src_stride, vl);
+ __riscv_vlseg8e8_v_u8m1(&v_u0, &v_u1, &v_u2, &v_u3, &v_u4, &v_u5, &v_u6,
+ &v_u7, src_ptr + 2 * src_stride, vl);
+ // Calculate sum of [e00, e22]
+ v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
+ v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
+ v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
+ v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl);
+ v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl);
+
+ v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
+ v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl);
+ v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl);
+ v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
+ // Calculate sum of [f00, f22]
+ v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
+ v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
+ v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
+ v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl);
+ v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl);
+
+ v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
+ v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl);
+ v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl);
+ v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
+ // Calculate sum of [g00, g12]
+ v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
+ v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
+ v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl);
+
+ v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
+ v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl);
+
+ // Average in 16-bit fixed-point
+ v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
+ v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
+ v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
+
+ v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
+ v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
+ v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
+ __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
+ w -= vl;
+ src_ptr += 8 * vl;
+ dst_ptr += 3 * vl;
+ } while (w > 0);
+}
+#endif
+
+// The ScaleRowUp2_(Bi)linear_RVV functions are equivalent to other platforms'
+// ScaleRowUp2_(Bi)linear_Any_XXX. They process the entire row; other
+// platforms implement only the non-edge part of the image and handle the
+// edges with scalar code.
+
+#ifdef HAS_SCALEROWUP2_LINEAR_RVV
+void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t work_width = (size_t)dst_width - 1u;
+ size_t src_width = work_width >> 1u;
+ const uint8_t* work_src_ptr = src_ptr;
+ uint8_t* work_dst_ptr = dst_ptr + 1;
+ size_t vl = __riscv_vsetvlmax_e8m4();
+ vuint8m4_t v_3 = __riscv_vmv_v_x_u8m4(3, vl);
+ dst_ptr[0] = src_ptr[0];
+ while (src_width > 0) {
+ vuint8m4_t v_src0, v_src1, v_dst_odd, v_dst_even;
+ vuint16m8_t v_src0_u16, v_src1_u16;
+ size_t vl = __riscv_vsetvl_e8m4(src_width);
+ v_src0 = __riscv_vle8_v_u8m4(work_src_ptr, vl);
+ v_src1 = __riscv_vle8_v_u8m4(work_src_ptr + 1, vl);
+
+ v_src0_u16 = __riscv_vwaddu_vx_u16m8(v_src0, 2, vl);
+ v_src1_u16 = __riscv_vwaddu_vx_u16m8(v_src1, 2, vl);
+ v_src0_u16 = __riscv_vwmaccu_vv_u16m8(v_src0_u16, v_3, v_src1, vl);
+ v_src1_u16 = __riscv_vwmaccu_vv_u16m8(v_src1_u16, v_3, v_src0, vl);
+
+ v_dst_odd = __riscv_vnsrl_wx_u8m4(v_src0_u16, 2, vl);
+ v_dst_even = __riscv_vnsrl_wx_u8m4(v_src1_u16, 2, vl);
+
+ __riscv_vsseg2e8_v_u8m4(work_dst_ptr, v_dst_even, v_dst_odd, vl);
+
+ src_width -= vl;
+ work_src_ptr += vl;
+ work_dst_ptr += 2 * vl;
+ }
+ dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2];
+}
+#endif
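+
+// One step of the 2x linear kernel above, in scalar form (the first and
+// last destination pixels are written directly from the source edges, as in
+// the code above):
+// dst[2 * i + 1] = (3 * src[i] + src[i + 1] + 2) >> 2;
+// dst[2 * i + 2] = (src[i] + 3 * src[i + 1] + 2) >> 2;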
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_RVV
+void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ size_t work_width = ((size_t)dst_width - 1u) & ~1u;
+ size_t src_width = work_width >> 1u;
+ const uint8_t* work_s = src_ptr;
+ const uint8_t* work_t = src_ptr + src_stride;
+ const uint8_t* s = work_s;
+ const uint8_t* t = work_t;
+ uint8_t* d = dst_ptr;
+ uint8_t* e = dst_ptr + dst_stride;
+ uint8_t* work_d = d + 1;
+ uint8_t* work_e = e + 1;
+ size_t vl = __riscv_vsetvlmax_e16m4();
+ vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl);
+ vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl);
+ d[0] = (3 * s[0] + t[0] + 2) >> 2;
+ e[0] = (s[0] + 3 * t[0] + 2) >> 2;
+ while (src_width > 0) {
+ vuint8m2_t v_s0, v_s1, v_t0, v_t1;
+ vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16;
+ vuint16m4_t v_t0_u16_, v_t1_u16_;
+ vuint8m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd;
+ size_t vl = __riscv_vsetvl_e8m2(src_width);
+ v_s0 = __riscv_vle8_v_u8m2(work_s, vl);
+ v_s1 = __riscv_vle8_v_u8m2(work_s + 1, vl);
+
+ v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+ v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+ v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl);
+ v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl);
+
+ v_t0 = __riscv_vle8_v_u8m2(work_t, vl);
+ v_t1 = __riscv_vle8_v_u8m2(work_t + 1, vl);
+
+ v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl);
+ v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl);
+ v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl);
+ v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl);
+
+ v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl);
+ v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl);
+
+ v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl);
+ v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl);
+ v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl);
+ v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl);
+
+ v_dst0_odd = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl);
+ v_dst0_even = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl);
+ v_dst1_odd = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl);
+ v_dst1_even = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl);
+
+ __riscv_vsseg2e8_v_u8m2(work_d, v_dst0_even, v_dst0_odd, vl);
+ __riscv_vsseg2e8_v_u8m2(work_e, v_dst1_even, v_dst1_odd, vl);
+
+ src_width -= vl;
+ work_s += vl;
+ work_t += vl;
+ work_d += 2 * vl;
+ work_e += 2 * vl;
+ }
+ d[dst_width - 1] =
+ (3 * s[(dst_width - 1) / 2] + t[(dst_width - 1) / 2] + 2) >> 2;
+ e[dst_width - 1] =
+ (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2;
+}
+#endif
+
+#ifdef HAS_SCALEUVROWDOWN2_RVV
+void ScaleUVRowDown2_RVV(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const uint32_t* src = (const uint32_t*)src_uv;
+ uint16_t* dst = (uint16_t*)dst_uv;
+ (void)src_stride;
+ do {
+ size_t vl = __riscv_vsetvl_e32m8(w);
+ vuint32m8_t v_data = __riscv_vle32_v_u32m8(src, vl);
+ vuint16m4_t v_u1v1 = __riscv_vnsrl_wx_u16m4(v_data, 16, vl);
+ __riscv_vse16_v_u16m4(dst, v_u1v1, vl);
+ w -= vl;
+ src += vl;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
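+
+// Each uint32_t lane holds two UV pixels; on a little-endian target the
+// vnsrl by 16 keeps the high half, i.e. the second (odd) UV pixel of every
+// pair, matching the NEON version's ld2/st1 selection. Scalar equivalent
+// with uint16_t UV pairs:
+// for (int i = 0; i < dst_width; ++i) dst[i] = src[2 * i + 1];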
+
+#ifdef HAS_SCALEUVROWDOWN2LINEAR_RVV
+void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const uint16_t* src = (const uint16_t*)src_uv;
+ (void)src_stride;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m4_t v_u0v0, v_u1v1, v_avg;
+ vuint16m4_t v_u0v0_16, v_u1v1_16;
+ size_t vl = __riscv_vsetvl_e16m4(w);
+ __riscv_vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl);
+ v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16);
+ v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16);
+ // Use round-to-nearest-up mode for averaging add
+ v_avg = __riscv_vaaddu_vv_u8m4(v_u0v0, v_u1v1, vl * 2);
+ __riscv_vse8_v_u8m4(dst_uv, v_avg, vl * 2);
+ w -= vl;
+ src += vl * 2;
+ dst_uv += vl * 2;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_RVV
+void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width) {
+ const uint8_t* src_uv_row1 = src_uv + src_stride;
+ size_t w = (size_t)dst_width;
+ // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up mode(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0;
+ vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;
+ vuint16m4_t v_u0u1_row0, v_u0u1_row1, v_v0v1_row0, v_v0v1_row1;
+ vuint16m4_t v_sum0, v_sum1;
+ vuint8m2_t v_dst_u, v_dst_v;
+ size_t vl = __riscv_vsetvl_e8m2(w);
+
+ __riscv_vlseg4e8_v_u8m2(&v_u0_row0, &v_v0_row0, &v_u1_row0, &v_v1_row0,
+ src_uv, vl);
+ __riscv_vlseg4e8_v_u8m2(&v_u0_row1, &v_v0_row1, &v_u1_row1, &v_v1_row1,
+ src_uv_row1, vl);
+
+ v_u0u1_row0 = __riscv_vwaddu_vv_u16m4(v_u0_row0, v_u1_row0, vl);
+ v_u0u1_row1 = __riscv_vwaddu_vv_u16m4(v_u0_row1, v_u1_row1, vl);
+ v_v0v1_row0 = __riscv_vwaddu_vv_u16m4(v_v0_row0, v_v1_row0, vl);
+ v_v0v1_row1 = __riscv_vwaddu_vv_u16m4(v_v0_row1, v_v1_row1, vl);
+
+ v_sum0 = __riscv_vadd_vv_u16m4(v_u0u1_row0, v_u0u1_row1, vl);
+ v_sum1 = __riscv_vadd_vv_u16m4(v_v0v1_row0, v_v0v1_row1, vl);
+ // Use round-to-nearest-up mode for vnclip
+ v_dst_u = __riscv_vnclipu_wx_u8m2(v_sum0, 2, vl);
+ v_dst_v = __riscv_vnclipu_wx_u8m2(v_sum1, 2, vl);
+
+ __riscv_vsseg2e8_v_u8m2(dst_uv, v_dst_u, v_dst_v, vl);
+
+ dst_uv += 2 * vl;
+ src_uv += 4 * vl;
+ w -= vl;
+ src_uv_row1 += 4 * vl;
+ } while (w > 0);
+}
+#endif
+
+#ifdef HAS_SCALEUVROWDOWN4_RVV
+void ScaleUVRowDown4_RVV(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ // Overflow cannot happen here: size_t is twice as wide as int
+ // (sizeof(size_t) / sizeof(int) == 2), and dst_width = src_width / 4
+ // where src_width is also an int.
+ size_t w = (size_t)dst_width * 8;
+ (void)src_stride;
+ (void)src_stepx;
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(w);
+ vuint8m8_t v_row = __riscv_vle8_v_u8m8(src_uv, vl);
+ vuint64m8_t v_row_64 = __riscv_vreinterpret_v_u8m8_u64m8(v_row);
+ // Narrowing without clipping
+ vuint32m4_t v_tmp = __riscv_vncvt_x_x_w_u32m4(v_row_64, vl / 8);
+ vuint16m2_t v_dst_16 = __riscv_vncvt_x_x_w_u16m2(v_tmp, vl / 8);
+ vuint8m2_t v_dst = __riscv_vreinterpret_v_u16m2_u8m2(v_dst_16);
+ __riscv_vse8_v_u8m2(dst_uv, v_dst, vl / 4);
+ w -= vl;
+ src_uv += vl;
+ dst_uv += vl / 4;
+ } while (w > 0);
+}
+#endif
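+
+// Each uint64_t lane holds four UV pixels; the two truncating narrows keep
+// the low 16 bits, i.e. the first UV pixel of every group of four on a
+// little-endian target. Scalar equivalent with uint16_t UV pairs:
+// for (int i = 0; i < dst_width; ++i) dst[i] = src[4 * i];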
+
+#ifdef HAS_SCALEUVROWDOWNEVEN_RVV
+void ScaleUVRowDownEven_RVV(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width) {
+ size_t w = (size_t)dst_width;
+ const ptrdiff_t stride_byte = (ptrdiff_t)src_stepx * 2;
+ const uint16_t* src = (const uint16_t*)(src_uv);
+ uint16_t* dst = (uint16_t*)(dst_uv);
+ (void)src_stride;
+ do {
+ size_t vl = __riscv_vsetvl_e16m8(w);
+ vuint16m8_t v_row = __riscv_vlse16_v_u16m8(src, stride_byte, vl);
+ __riscv_vse16_v_u16m8(dst, v_row, vl);
+ w -= vl;
+ src += vl * src_stepx;
+ dst += vl;
+ } while (w > 0);
+}
+#endif
+
+// The ScaleUVRowUp2_(Bi)linear_RVV functions are equivalent to other
+// platforms' ScaleUVRowUp2_(Bi)linear_Any_XXX. They process the entire row;
+// other platforms implement only the non-edge part of the image and handle
+// the edges with scalar code.
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV
+void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ size_t work_width = ((size_t)dst_width - 1u) & ~1u;
+ uint16_t* work_dst_ptr = (uint16_t*)dst_ptr + 1;
+ const uint8_t* work_src_ptr = src_ptr;
+ size_t vl = __riscv_vsetvlmax_e8m4();
+ vuint8m4_t v_3_u8 = __riscv_vmv_v_x_u8m4(3, vl);
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ while (work_width > 0) {
+ vuint8m4_t v_uv0, v_uv1, v_dst_odd_u8, v_dst_even_u8;
+ vuint16m4_t v_dst_odd, v_dst_even;
+ vuint16m8_t v_uv0_u16, v_uv1_u16;
+ size_t vl = __riscv_vsetvl_e8m4(work_width);
+ v_uv0 = __riscv_vle8_v_u8m4(work_src_ptr, vl);
+ v_uv1 = __riscv_vle8_v_u8m4(work_src_ptr + 2, vl);
+
+ v_uv0_u16 = __riscv_vwaddu_vx_u16m8(v_uv0, 2, vl);
+ v_uv1_u16 = __riscv_vwaddu_vx_u16m8(v_uv1, 2, vl);
+
+ v_uv0_u16 = __riscv_vwmaccu_vv_u16m8(v_uv0_u16, v_3_u8, v_uv1, vl);
+ v_uv1_u16 = __riscv_vwmaccu_vv_u16m8(v_uv1_u16, v_3_u8, v_uv0, vl);
+
+ v_dst_odd_u8 = __riscv_vnsrl_wx_u8m4(v_uv0_u16, 2, vl);
+ v_dst_even_u8 = __riscv_vnsrl_wx_u8m4(v_uv1_u16, 2, vl);
+
+ v_dst_even = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_even_u8);
+ v_dst_odd = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_odd_u8);
+
+ __riscv_vsseg2e16_v_u16m4(work_dst_ptr, v_dst_even, v_dst_odd, vl / 2);
+
+ work_width -= vl;
+ work_src_ptr += vl;
+ work_dst_ptr += vl;
+ }
+ dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2];
+ dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1];
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV
+void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ size_t work_width = ((size_t)dst_width - 1u) & ~1u;
+ const uint8_t* work_s = src_ptr;
+ const uint8_t* work_t = src_ptr + src_stride;
+ const uint8_t* s = work_s;
+ const uint8_t* t = work_t;
+ uint8_t* d = dst_ptr;
+ uint8_t* e = dst_ptr + dst_stride;
+ uint16_t* work_d = (uint16_t*)d + 1;
+ uint16_t* work_e = (uint16_t*)e + 1;
+ size_t vl = __riscv_vsetvlmax_e16m4();
+ vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl);
+ vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl);
+ d[0] = (3 * s[0] + t[0] + 2) >> 2;
+ e[0] = (s[0] + 3 * t[0] + 2) >> 2;
+ d[1] = (3 * s[1] + t[1] + 2) >> 2;
+ e[1] = (s[1] + 3 * t[1] + 2) >> 2;
+ while (work_width > 0) {
+ vuint8m2_t v_s0, v_s1, v_t0, v_t1;
+ vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16;
+ vuint16m4_t v_t0_u16_, v_t1_u16_;
+ vuint8m2_t v_dst0_odd_u8, v_dst0_even_u8, v_dst1_odd_u8, v_dst1_even_u8;
+ vuint16m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd;
+ size_t vl = __riscv_vsetvl_e8m2(work_width);
+ v_s0 = __riscv_vle8_v_u8m2(work_s, vl);
+ v_s1 = __riscv_vle8_v_u8m2(work_s + 2, vl);
+
+ v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+ v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+ v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl);
+ v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl);
+
+ v_t0 = __riscv_vle8_v_u8m2(work_t, vl);
+ v_t1 = __riscv_vle8_v_u8m2(work_t + 2, vl);
+
+ v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl);
+ v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl);
+ v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl);
+ v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl);
+
+ v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl);
+ v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl);
+
+ v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl);
+ v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl);
+ v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl);
+ v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl);
+
+ v_dst0_odd_u8 = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl);
+ v_dst0_even_u8 = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl);
+ v_dst1_odd_u8 = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl);
+ v_dst1_even_u8 = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl);
+
+ v_dst0_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_even_u8);
+ v_dst0_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_odd_u8);
+ v_dst1_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_even_u8);
+ v_dst1_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_odd_u8);
+
+ __riscv_vsseg2e16_v_u16m2(work_d, v_dst0_even, v_dst0_odd, vl / 2);
+ __riscv_vsseg2e16_v_u16m2(work_e, v_dst1_even, v_dst1_odd, vl / 2);
+
+ work_width -= vl;
+ work_s += vl;
+ work_t += vl;
+ work_d += vl;
+ work_e += vl;
+ }
+ d[2 * dst_width - 2] =
+ (3 * s[((dst_width + 1) & ~1) - 2] + t[((dst_width + 1) & ~1) - 2] + 2) >>
+ 2;
+ e[2 * dst_width - 2] =
+ (s[((dst_width + 1) & ~1) - 2] + 3 * t[((dst_width + 1) & ~1) - 2] + 2) >>
+ 2;
+ d[2 * dst_width - 1] =
+ (3 * s[((dst_width + 1) & ~1) - 1] + t[((dst_width + 1) & ~1) - 1] + 2) >>
+ 2;
+ e[2 * dst_width - 1] =
+ (s[((dst_width + 1) & ~1) - 1] + 3 * t[((dst_width + 1) & ~1) - 1] + 2) >>
+ 2;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) &&
+ // defined(__clang__)
diff --git a/source/scale_uv.cc b/source/scale_uv.cc
new file mode 100644
index 00000000..0931c89a
--- /dev/null
+++ b/source/scale_uv.cc
@@ -0,0 +1,1210 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyUV
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Macros to enable specialized scalers
+
+#ifndef HAS_SCALEUVDOWN2
+#define HAS_SCALEUVDOWN2 1
+#endif
+#ifndef HAS_SCALEUVDOWN4BOX
+#define HAS_SCALEUVDOWN4BOX 1
+#endif
+#ifndef HAS_SCALEUVDOWNEVEN
+#define HAS_SCALEUVDOWNEVEN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARDOWN
+#define HAS_SCALEUVBILINEARDOWN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARUP
+#define HAS_SCALEUVBILINEARUP 1
+#endif
+#ifndef HAS_UVCOPY
+#define HAS_UVCOPY 1
+#endif
+#ifndef HAS_SCALEPLANEVERTICAL
+#define HAS_SCALEPLANEVERTICAL 1
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// ScaleUV, 1/2
+// This is an optimized version for scaling down a UV plane to 1/2 of
+// its original size.
+#if HAS_SCALEUVDOWN2
+static void ScaleUVDown2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_C
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C
+ : ScaleUVRowDown2Box_C);
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2;
+ } else {
+ src_uv += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 2;
+ }
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
+ : ScaleUVRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
+ : ScaleUVRowDown2Box_NEON);
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_RVV
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_RVV
+ : ScaleUVRowDown2Box_RVV);
+ }
+#endif
+
+// This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3
+ : ScaleUVRowDown2Box_Any_SSSE3);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3
+ : ScaleUVRowDown2Box_SSSE3);
+ }
+ }
+#endif
+
+#if defined(HAS_SCALEUVROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA
+ : ScaleUVRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA
+ : ScaleUVRowDown2Box_MSA);
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif // HAS_SCALEUVDOWN2
+
+// ScaleUV, 1/4
+// This is an optimized version for scaling down a UV plane to 1/4 of
+// its original size.
+#if HAS_SCALEUVDOWN4BOX
+static int ScaleUVDown4Box(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ // Allocate 2 rows of UV.
+ const int row_size = (dst_width * 2 * 2 + 15) & ~15;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ ScaleUVRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2;
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_RVV;
+ }
+#endif
+
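+  // Each pass halves both dimensions: the first two calls box-average pairs
+  // of source rows into two half-width intermediate rows, and the third call
+  // box-averages those two rows, so each output pixel is the mean of a 4x4
+  // block of source pixels.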
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2);
+ ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + row_size,
+ dst_width * 2);
+ ScaleUVRowDown2(row, row_size, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+ free_aligned_buffer_64(row);
+ return 0;
+}
+#endif // HAS_SCALEUVDOWN4BOX
+
+// ScaleUV Even
+// This is an optimized version for scaling down a UV plane by an even
+// integer factor (to 1/2, 1/4, 1/6, ... of its original size).
+#if HAS_SCALEUVDOWNEVEN
+static void ScaleUVDownEven(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int col_step = dx >> 16;
+ ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride);
+ void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ int src_step, uint8_t* dst_uv, int dst_width) =
+ filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C;
+ (void)src_width;
+ (void)src_height;
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2;
+#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
+ : ScaleUVRowDownEven_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && !filtering) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif // TODO(fbarchard): Enable Box filter
+#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON
+ : ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_RVV)
+ if (TestCpuFlag(kCpuHasRVV) && !filtering) {
+ ScaleUVRowDownEven =
+ (col_step == 4) ? ScaleUVRowDown4_RVV : ScaleUVRowDownEven_RVV;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif
+
+// Scale UV down with bilinear interpolation.
+#if HAS_SCALEUVBILINEARDOWN
+static int ScaleUVBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t* dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C;
+ int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+ int64_t xl = (dx >= 0) ? x : xlast;
+ int64_t xr = (dx >= 0) ? xlast : x;
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+ xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2.
+ src_uv += xl * 2;
+ x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of UV.
+ {
+ const int max_y = (src_height - 1) << 16;
+ align_buffer_64(row, clip_src_width * 2);
+ if (!row)
+ return 1;
+ if (y > max_y) {
+ y = max_y;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint8_t* src = src_uv + yi * (intptr_t)src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleUVFilterCols(dst_uv, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleUVFilterCols(dst_uv, row, dst_width, x, dx);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+ return 0;
+}
+#endif
+
+// Scale UV up with bilinear interpolation.
+#if HAS_SCALEUVBILINEARUP
+static int ScaleUVBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t* dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleUVFilterCols_C : ScaleUVCols_C;
+ const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
+ if (src_width >= 32768) {
+ ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C;
+ }
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleUVFilterCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleUVFilterCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ {
+ int yi = y >> 16;
+ const uint8_t* src = src_uv + yi * (intptr_t)src_stride;
+
+ // Allocate 2 rows of UV.
+ const int row_size = (dst_width * 2 + 15) & ~15;
+ align_buffer_64(row, row_size * 2);
+ if (!row)
+ return 1;
+
+ uint8_t* rowptr = row;
+ int rowstride = row_size;
+ int lasty = yi;
+
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ if (src_height > 2) {
+ src += src_stride;
+ }
+
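+    // rowptr/rowstride implement a two-row ping-pong buffer: after scaling a
+    // new source row, rowstride is negated so rowptr alternates between the
+    // two rows, and only one row needs horizontal scaling per source advance.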
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_uv + yi * (intptr_t)src_stride;
+ }
+ if (yi != lasty) {
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ if ((y + 65536) < max_y) {
+ src += src_stride;
+ }
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+ return 0;
+}
+#endif // HAS_SCALEUVBILINEARUP
+
+// Scale UV, horizontally up by 2 times.
+// Uses linear filter horizontally, nearest vertically.
+// This is an optimized version for scaling up a plane to twice its
+// original width, using linear interpolation.
+// This is used to scale U and V planes of NV16 to NV24.
+static void ScaleUVLinearUp2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv) {
+ void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) =
+ ScaleUVRowUp2_Linear_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_RVV;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width);
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale plane, up by 2 times.
+// This is an optimized version for scaling up a plane to twice its
+// original size, using bilinear interpolation.
+// This is used to scale U and V planes of NV12 to NV24.
+static void ScaleUVBilinearUp2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleUVRowUp2_Bilinear_Any_C;
+ int x;
+
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV
+ if (TestCpuFlag(kCpuHasRVV)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_RVV;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ // TODO(fbarchard): Test performance of writing one row of destination at a
+ // time.
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
+// Scale 16 bit UV, horizontally up by 2 times.
+// Uses linear filter horizontally, nearest vertically.
+// This is an optimized version for scaling up a plane to twice its
+// original width, using linear interpolation.
+// This is used to scale U and V planes of P210 to P410.
+static void ScaleUVLinearUp2_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_uv,
+ uint16_t* dst_uv) {
+ void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) =
+ ScaleUVRowUp2_Linear_16_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width);
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale 16 bit UV, up by 2 times.
+// This is an optimized version for scaling up a plane to twice its
+// original size, using bilinear interpolation.
+// This is used to scale U and V planes of P010 to P410.
+static void ScaleUVBilinearUp2_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleUVRowUp2_Bilinear_16_Any_C;
+ int x;
+
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ // TODO(fbarchard): Test performance of writing one row of destination at a
+ // time.
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
+// Scale UV to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: the upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
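+// For example, scaling 640 wide down to 512 gives dx of roughly 0x14000
+// (1.25 in 16.16 fixed point), so every 4 output pixels advance the source
+// position by 5 pixels.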
+
+static void ScaleUVSimple(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ void (*ScaleUVCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width,
+ int x, int dx) =
+ (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C;
+ (void)src_height;
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleUVCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVCols(dst_uv, src_uv + (y >> 16) * (intptr_t)src_stride, dst_width, x,
+ dx);
+ dst_uv += dst_stride;
+ y += dy;
+ }
+}
+
+// Copy UV with optional flipping
+#if HAS_UVCOPY
+static int UVCopy(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+
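+  // Interleaved UV is 2 bytes per pixel, hence width * 2 bytes per row.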
+ CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, width * 2, height);
+ return 0;
+}
+
+static int UVCopy_16(const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ CopyPlane_16(src_uv, src_stride_uv, dst_uv, dst_stride_uv, width * 2, height);
+ return 0;
+}
+#endif // HAS_UVCOPY
+
+// Scale a UV plane (from NV12)
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static int ScaleUV(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // UV does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative src_height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * (intptr_t)src_stride;
+ src_stride = -src_stride;
+ }
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+ if (clip_x) {
+ int64_t clipf = (int64_t)(clip_x)*dx;
+ x += (clipf & 0xffff);
+ src += (clipf >> 16) * 2;
+ dst += clip_x * 2;
+ }
+ if (clip_y) {
+ int64_t clipf = (int64_t)(clip_y)*dy;
+ y += (clipf & 0xffff);
+ src += (clipf >> 16) * (intptr_t)src_stride;
+ dst += clip_y * dst_stride;
+ }
+
+ // Special case for integer step values.
+ if (((dx | dy) & 0xffff) == 0) {
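+    // With 16.16 fixed point steps, a zero fractional part means an integer
+    // scale factor, and bit 16 is the low bit of that integer: the tests on
+    // (dx & 0x10000) below separate even factors (2, 4, ...) from odd ones
+    // (3, 5, ...).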
+ if (!dx || !dy) { // 1 pixel wide and/or tall.
+ filtering = kFilterNone;
+ } else {
+ // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+ if (!(dx & 0x10000) && !(dy & 0x10000)) {
+#if HAS_SCALEUVDOWN2
+ if (dx == 0x20000) {
+ // Optimized 1/2 downsample.
+ ScaleUVDown2(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return 0;
+ }
+#endif
+#if HAS_SCALEUVDOWN4BOX
+ if (dx == 0x40000 && filtering == kFilterBox) {
+ // Optimized 1/4 box downsample.
+ return ScaleUVDown4Box(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y,
+ dy);
+ }
+#endif
+#if HAS_SCALEUVDOWNEVEN
+ ScaleUVDownEven(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return 0;
+#endif
+ }
+ // Optimized odd scale down. ie 3, 5, 7, 9x.
+ if ((dx & 0x10000) && (dy & 0x10000)) {
+ filtering = kFilterNone;
+#ifdef HAS_UVCOPY
+ if (dx == 0x10000 && dy == 0x10000) {
+ // Straight copy.
+ UVCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2,
+ src_stride, dst, dst_stride, clip_width, clip_height);
+ return 0;
+ }
+#endif
+ }
+ }
+ }
+ // HAS_SCALEPLANEVERTICAL
+ if (dx == 0x10000 && (x & 0xffff) == 0) {
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, y, dy, /*bpp=*/2, filtering);
+ return 0;
+ }
+ if ((filtering == kFilterLinear) && ((dst_width + 1) / 2 == src_width)) {
+ ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst);
+ return 0;
+ }
+ if ((clip_height + 1) / 2 == src_height &&
+ (clip_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst);
+ return 0;
+ }
+#if HAS_SCALEUVBILINEARUP
+ if (filtering && dy < 65536) {
+ return ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ }
+#endif
+#if HAS_SCALEUVBILINEARDOWN
+ if (filtering) {
+ return ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ }
+#endif
+ ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, dx, y, dy);
+ return 0;
+}
+
+// Scale a UV image.
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+ src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ return ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv,
+ dst_stride_uv, dst_width, dst_height, 0, 0, dst_width,
+ dst_height, filtering);
+}
+
+// Scale a 16 bit UV image.
+// This function is currently incomplete; it can't handle all cases.
+LIBYUV_API
+int UVScale_16(const uint16_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int dy = 0;
+
+ if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+ src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ // UV does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative src_height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src_uv = src_uv + (src_height - 1) * (intptr_t)src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+ src_width = Abs(src_width);
+
+#ifdef HAS_UVCOPY
+ if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) {
+ if (dst_height == 1) {
+ UVCopy_16(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride_uv,
+ src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height);
+ } else {
+ dy = src_height / dst_height;
+ UVCopy_16(src_uv + ((dy - 1) / 2) * (intptr_t)src_stride_uv,
+ (int)(dy * (intptr_t)src_stride_uv), dst_uv, dst_stride_uv,
+ dst_width, dst_height);
+ }
+
+ return 0;
+ }
+#endif
+
+ if ((filtering == kFilterLinear) && ((dst_width + 1) / 2 == src_width)) {
+ ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height,
+ src_stride_uv, dst_stride_uv, src_uv, dst_uv);
+ return 0;
+ }
+
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScaleUVBilinearUp2_16(src_width, src_height, dst_width, dst_height,
+ src_stride_uv, dst_stride_uv, src_uv, dst_uv);
+ return 0;
+ }
+
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale_win.cc b/source/scale_win.cc
index c5fc86f3..ea1f95c6 100644
--- a/files/source/scale_win.cc
+++ b/source/scale_win.cc
@@ -16,8 +16,9 @@ namespace libyuv {
extern "C" {
#endif
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for 32 bit Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ !defined(__clang__) && defined(_M_IX86)
// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
diff --git a/source/test.sh b/source/test.sh
new file mode 100755
index 00000000..7f12c3c1
--- /dev/null
+++ b/source/test.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+set -x
+
+function runbenchmark1 {
+ perf record /google/src/cloud/fbarchard/clean/google3/blaze-bin/third_party/libyuv/libyuv_test --gunit_filter=*$1 --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1
+ perf report | grep AVX
+}
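+
+# The perf binary path above is specific to the author's checkout; point it
+# at your own libyuv_test build output to reproduce these measurements.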
+
+runbenchmark1 ABGRToI420
+runbenchmark1 Android420ToI420
+runbenchmark1 ARGBToI420
+runbenchmark1 Convert16To8Plane
+runbenchmark1 ConvertToARGB
+runbenchmark1 ConvertToI420
+runbenchmark1 CopyPlane
+runbenchmark1 H010ToAB30
+runbenchmark1 H010ToAR30
+runbenchmark1 HalfFloatPlane
+runbenchmark1 I010ToAB30
+runbenchmark1 I010ToAR30
+runbenchmark1 I420Copy
+runbenchmark1 I420Psnr
+runbenchmark1 I420Scale
+runbenchmark1 I420Ssim
+runbenchmark1 I420ToARGB
+runbenchmark1 I420ToNV12
+runbenchmark1 I420ToUYVY
+runbenchmark1 I422ToI420
+runbenchmark1 InitCpuFlags
+runbenchmark1 J420ToARGB
+runbenchmark1 NV12ToARGB
+runbenchmark1 NV12ToI420
+runbenchmark1 NV12ToI420Rotate
+runbenchmark1 SetCpuFlags
+runbenchmark1 YUY2ToI420
diff --git a/files/source/video_common.cc b/source/video_common.cc
index 92384c05..92384c05 100644
--- a/files/source/video_common.cc
+++ b/source/video_common.cc
diff --git a/tools_libyuv/OWNERS b/tools_libyuv/OWNERS
new file mode 100644
index 00000000..aae4fb6e
--- /dev/null
+++ b/tools_libyuv/OWNERS
@@ -0,0 +1,4 @@
+mbonadei@chromium.org
+fbarchard@chromium.org
+pbos@chromium.org
+
diff --git a/tools_libyuv/autoroller/roll_deps.py b/tools_libyuv/autoroller/roll_deps.py
new file mode 100755
index 00000000..d5c1089f
--- /dev/null
+++ b/tools_libyuv/autoroller/roll_deps.py
@@ -0,0 +1,822 @@
+#!/usr/bin/env vpython3
+
+# Copyright (c) 2017 The LibYUV project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+"""Script to automatically roll dependencies in the LibYUV DEPS file."""
+
+
+import argparse
+import base64
+import collections
+import logging
+import os
+import re
+import subprocess
+import sys
+import urllib.request
+
+
+def FindSrcDirPath():
+ """Returns the abs path to the src/ dir of the project."""
+ src_dir = os.path.dirname(os.path.abspath(__file__))
+ while os.path.basename(src_dir) != 'src':
+ src_dir = os.path.normpath(os.path.join(src_dir, os.pardir))
+ return src_dir
+
+
+# Skip these dependencies (list without solution name prefix).
+DONT_AUTOROLL_THESE = [
+ 'third_party/fuchsia-gn-sdk',
+ 'src/third_party/gflags/src',
+ 'src/third_party/mockito/src',
+]
+
+# These dependencies are missing in chromium/src/DEPS, either unused or already
+# in-tree. For instance, src/base is a part of the Chromium source git repo,
+# but we pull it through a subtree mirror, and therefore it isn't listed in
+# Chromium's deps but it is in ours.
+LIBYUV_ONLY_DEPS = [
+ 'src/base',
+ 'src/build',
+ 'src/buildtools',
+ 'src/ios',
+ 'src/testing',
+ 'src/third_party',
+ 'src/third_party/android_support_test_runner',
+ 'src/third_party/bazel',
+ 'src/third_party/bouncycastle',
+ 'src/third_party/errorprone/lib',
+ 'src/third_party/findbugs',
+ 'src/third_party/gson',
+ 'src/third_party/gtest-parallel',
+ 'src/third_party/guava',
+ 'src/third_party/intellij',
+ 'src/third_party/jsr-305/src',
+ 'src/third_party/ow2_asm',
+ 'src/third_party/proguard',
+ 'src/third_party/ub-uiautomator/lib',
+ 'src/tools',
+ 'src/tools/clang/dsymutil',
+]
+
+LIBYUV_URL = 'https://chromium.googlesource.com/libyuv/libyuv'
+CHROMIUM_SRC_URL = 'https://chromium.googlesource.com/chromium/src'
+CHROMIUM_COMMIT_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s'
+CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s'
+CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s'
+
+COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$')
+CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([-0-9a-z]+)\'$')
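+# CLANG_REVISION_RE matches lines in tools/clang/scripts/update.py such as
+# (hypothetical value): CLANG_REVISION = 'llvmorg-17-init-10134-g1234abcd'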
+ROLL_BRANCH_NAME = 'roll_chromium_revision'
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+CHECKOUT_SRC_DIR = FindSrcDirPath()
+CHECKOUT_ROOT_DIR = os.path.realpath(os.path.join(CHECKOUT_SRC_DIR, os.pardir))
+
+# Copied from tools/android/roll/android_deps/.../BuildConfigGenerator.groovy.
+ANDROID_DEPS_START = r'=== ANDROID_DEPS Generated Code Start ==='
+ANDROID_DEPS_END = r'=== ANDROID_DEPS Generated Code End ==='
+# Location of automatically gathered android deps.
+ANDROID_DEPS_PATH = 'src/third_party/android_deps/'
+
+sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build'))
+import find_depot_tools
+
+find_depot_tools.add_depot_tools_to_path()
+
+CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py'
+CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools',
+ 'clang', 'scripts', 'update.py')
+
+DepsEntry = collections.namedtuple('DepsEntry', 'path url revision')
+ChangedDep = collections.namedtuple('ChangedDep',
+ 'path url current_rev new_rev')
+CipdDepsEntry = collections.namedtuple('CipdDepsEntry', 'path packages')
+VersionEntry = collections.namedtuple('VersionEntry', 'version')
+ChangedCipdPackage = collections.namedtuple(
+ 'ChangedCipdPackage', 'path package current_version new_version')
+ChangedVersionEntry = collections.namedtuple(
+ 'ChangedVersionEntry', 'path current_version new_version')
+
+ChromiumRevisionUpdate = collections.namedtuple('ChromiumRevisionUpdate',
+ ('current_chromium_rev '
+ 'new_chromium_rev '))
+
+
+class RollError(Exception):
+ pass
+
+
+def StrExpansion():
+ return lambda str_value: str_value
+
+
+def VarLookup(local_scope):
+ return lambda var_name: local_scope['vars'][var_name]
+
+
+def ParseDepsDict(deps_content):
+ local_scope = {}
+ global_scope = {
+ 'Str': StrExpansion(),
+ 'Var': VarLookup(local_scope),
+ 'deps_os': {},
+ }
+ exec(deps_content, global_scope, local_scope)
+ return local_scope
+
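+# ParseDepsDict relies on exec() evaluating the DEPS file top to bottom: as
+# DEPS files conventionally define 'vars' before 'deps', the Var() lambda can
+# resolve variables while 'deps' is still being evaluated. A hypothetical
+# snippet this handles:
+#   vars = {'chromium_git': 'https://chromium.googlesource.com'}
+#   deps = {'src/foo': Var('chromium_git') + '/foo' + '@' + 'deadbeef'}
+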
+
+def ParseLocalDepsFile(filename):
+ with open(filename, 'rb') as f:
+ deps_content = f.read().decode('utf-8')
+ return ParseDepsDict(deps_content)
+
+
+def ParseCommitPosition(commit_message):
+ for line in reversed(commit_message.splitlines()):
+ m = COMMIT_POSITION_RE.match(line.strip())
+ if m:
+ return int(m.group(1))
+ logging.error('Failed to parse commit position id from:\n%s\n',
+ commit_message)
+ sys.exit(-1)
+
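+# The footer being parsed looks like, e.g.:
+#   Cr-Commit-Position: refs/heads/main@{#1234567}
+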
+
+def _RunCommand(command,
+ working_dir=None,
+ ignore_exit_code=False,
+ extra_env=None,
+ input_data=None):
+ """Runs a command and returns the output from that command.
+
+ If the command fails (exit code != 0), the function will exit the process.
+
+ Returns:
+ A tuple containing the stdout and stderr outputs as strings.
+ """
+ working_dir = working_dir or CHECKOUT_SRC_DIR
+ logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir)
+ env = os.environ.copy()
+ if extra_env:
+ assert all(isinstance(value, str) for value in extra_env.values())
+ logging.debug('extra env: %s', extra_env)
+ env.update(extra_env)
+ p = subprocess.Popen(command,
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ env=env,
+ cwd=working_dir,
+ universal_newlines=True)
+ std_output, err_output = p.communicate(input_data)
+ p.stdout.close()
+ p.stderr.close()
+ if not ignore_exit_code and p.returncode != 0:
+ logging.error('Command failed: %s\n'
+ 'stdout:\n%s\n'
+ 'stderr:\n%s\n', ' '.join(command), std_output, err_output)
+ sys.exit(p.returncode)
+ return std_output, err_output
+
+
+def _GetBranches():
+ """Returns a tuple of active,branches.
+
+ The 'active' is the name of the currently active branch and 'branches' is a
+ list of all branches.
+ """
+ lines = _RunCommand(['git', 'branch'])[0].split('\n')
+ branches = []
+ active = ''
+ for line in lines:
+ if '*' in line:
+ # The assumption is that the first char will always be the '*'.
+ active = line[1:].strip()
+ branches.append(active)
+ else:
+ branch = line.strip()
+ if branch:
+ branches.append(branch)
+ return active, branches
+
+
+def _ReadGitilesContent(url):
+ # Download and decode BASE64 content until
+ # https://code.google.com/p/gitiles/issues/detail?id=7 is fixed.
+ base64_content = ReadUrlContent(url + '?format=TEXT')
+ return base64.b64decode(base64_content[0]).decode('utf-8')
+
+
+def ReadRemoteCrFile(path_below_src, revision):
+ """Reads a remote Chromium file of a specific revision.
+
+ Args:
+ path_below_src: A path to the target file relative to src dir.
+ revision: Revision to read.
+ Returns:
+ A string with file content.
+ """
+ return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE %
+ (revision, path_below_src))
+
+
+def ReadRemoteCrCommit(revision):
+ """Reads a remote Chromium commit message. Returns a string."""
+ return _ReadGitilesContent(CHROMIUM_COMMIT_TEMPLATE % revision)
+
+
+def ReadUrlContent(url):
+ """Connect to a remote host and read the contents.
+
+ Args:
+ url: URL to connect to.
+ Returns:
+ A list of lines.
+ """
+ conn = urllib.request.urlopen(url)
+ try:
+ return conn.readlines()
+ except IOError as e:
+ logging.exception('Error connecting to %s. Error: %s', url, e)
+ raise
+ finally:
+ conn.close()
+
+
+def GetMatchingDepsEntries(depsentry_dict, dir_path):
+ """Gets all deps entries matching the provided path.
+
+ This list may contain more than one DepsEntry object.
+ Example: dir_path='src/testing' would give results containing both
+ 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's
+ DEPS.
+ Example 2: dir_path='src/build' should return 'src/build' but not
+ 'src/buildtools'.
+
+ Returns:
+ A list of DepsEntry objects.
+ """
+ result = []
+ for path, depsentry in depsentry_dict.items():
+ if path == dir_path:
+ result.append(depsentry)
+ else:
+ parts = path.split('/')
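+      # Compare component-wise so that dir_path='src/build' matches
+      # 'src/build/foo' but not 'src/buildtools'.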
+ if all(part == parts[i] for i, part in enumerate(dir_path.split('/'))):
+ result.append(depsentry)
+ return result
+
+
+def BuildDepsentryDict(deps_dict):
+ """Builds a dict of paths to DepsEntry objects from a raw deps dict."""
+ result = {}
+
+ def AddDepsEntries(deps_subdict):
+ for path, dep in deps_subdict.items():
+ if path in result:
+ continue
+ if not isinstance(dep, dict):
+ dep = {'url': dep}
+ if dep.get('dep_type') == 'cipd':
+ result[path] = CipdDepsEntry(path, dep['packages'])
+ else:
+ if '@' not in dep['url']:
+ continue
+ url, revision = dep['url'].split('@')
+ result[path] = DepsEntry(path, url, revision)
+
+ def AddVersionEntry(vars_subdict):
+ for key, value in vars_subdict.items():
+ if key in result:
+ continue
+ if not key.endswith('_version'):
+ continue
+ key = re.sub('_version$', '', key)
+ result[key] = VersionEntry(value)
+
+ AddDepsEntries(deps_dict['deps'])
+  for deps_os in ['win', 'mac', 'unix', 'android', 'ios']:
+ AddDepsEntries(deps_dict.get('deps_os', {}).get(deps_os, {}))
+ AddVersionEntry(deps_dict.get('vars', {}))
+ return result
+
+
+def _FindChangedCipdPackages(path, old_pkgs, new_pkgs):
+ old_pkgs_names = {p['package'] for p in old_pkgs}
+ new_pkgs_names = {p['package'] for p in new_pkgs}
+ pkgs_equal = (old_pkgs_names == new_pkgs_names)
+ added_pkgs = [p for p in new_pkgs_names if p not in old_pkgs_names]
+ removed_pkgs = [p for p in old_pkgs_names if p not in new_pkgs_names]
+
+ assert pkgs_equal, ('Old: %s\n New: %s.\nYou need to do a manual roll '
+ 'and remove/add entries in DEPS so the old and new '
+ 'list match.\nMost likely, you should add \"%s\" and '
+ 'remove \"%s\"' %
+ (old_pkgs, new_pkgs, added_pkgs, removed_pkgs))
+
+ for old_pkg in old_pkgs:
+ for new_pkg in new_pkgs:
+ old_version = old_pkg['version']
+ new_version = new_pkg['version']
+ if (old_pkg['package'] == new_pkg['package']
+ and old_version != new_version):
+ logging.debug('Roll dependency %s to %s', path, new_version)
+ yield ChangedCipdPackage(path, old_pkg['package'], old_version,
+ new_version)
+
+
+def _FindChangedVars(name, old_version, new_version):
+ if old_version != new_version:
+ logging.debug('Roll dependency %s to %s', name, new_version)
+ yield ChangedVersionEntry(name, old_version, new_version)
+
+
+def _FindNewDeps(old, new):
+ """ Gather dependencies only in `new` and return corresponding paths. """
+ old_entries = set(BuildDepsentryDict(old))
+ new_entries = set(BuildDepsentryDict(new))
+ return [
+ path for path in new_entries - old_entries
+ if path not in DONT_AUTOROLL_THESE
+ ]
+
+
+def FindAddedDeps(libyuv_deps, new_cr_deps):
+ """
+ Calculate new deps entries of interest.
+
+  Ideally, that would mean: deps only appearing in the Chromium DEPS file
+  but transitively used in LibYUV.
+
+  Since that's hard to compute, we restrict ourselves to a well-defined
+  subset: deps sitting in `ANDROID_DEPS_PATH`. Anything else is assumed
+  to be a Chromium-only dependency.
+
+ Args:
+ libyuv_deps: dict of deps as defined in the LibYUV DEPS file.
+ new_cr_deps: dict of deps as defined in the chromium DEPS file.
+
+  Caveat: Doesn't detect a new package in an existing dep.
+
+ Returns:
+ A tuple consisting of:
+      A list of paths of added dependencies sitting in `ANDROID_DEPS_PATH`.
+ A list of paths for other added dependencies.
+ """
+ all_added_deps = _FindNewDeps(libyuv_deps, new_cr_deps)
+ generated_android_deps = [
+ path for path in all_added_deps if path.startswith(ANDROID_DEPS_PATH)
+ ]
+ other_deps = [
+ path for path in all_added_deps if path not in generated_android_deps
+ ]
+ return generated_android_deps, other_deps
+
+
+def FindRemovedDeps(libyuv_deps, new_cr_deps):
+ """
+ Calculate obsolete deps entries.
+
+  Ideally, that would mean: deps no longer appearing in the Chromium DEPS
+  file and not used in LibYUV.
+
+ Since it's hard to compute:
+ 1/ We restrict ourselves to a well defined subset:
+ deps sitting in `ANDROID_DEPS_PATH`.
+  2/ We rely on the existing behavior of CalculateChangedDeps,
+     i.e. assume non-CIPD dependencies are LibYUV-only and don't remove them.
+
+ Args:
+ libyuv_deps: dict of deps as defined in the LibYUV DEPS file.
+ new_cr_deps: dict of deps as defined in the chromium DEPS file.
+
+  Caveat: Doesn't detect a deleted package in an existing dep.
+
+ Returns:
+ A tuple consisting of:
+ A list of paths of dependencies removed from `ANDROID_DEPS_PATH`.
+ A list of paths of unexpected disappearing dependencies.
+ """
+ all_removed_deps = _FindNewDeps(new_cr_deps, libyuv_deps)
+ generated_android_deps = sorted(
+ [path for path in all_removed_deps if path.startswith(ANDROID_DEPS_PATH)])
+  # LibYUV-only dependencies are handled in CalculateChangedDeps.
+ other_deps = sorted([
+ path for path in all_removed_deps
+ if path not in generated_android_deps and path not in LIBYUV_ONLY_DEPS
+ ])
+ return generated_android_deps, other_deps
+
+
+def CalculateChangedDeps(libyuv_deps, new_cr_deps):
+ """
+ Calculate changed deps entries based on entries defined in the LibYUV DEPS
+ file:
+  - If it's a dependency shared with the Chromium DEPS file: roll it to the
+    same revision as Chromium (i.e. the entry in the new_cr_deps dict)
+ - If it's a Chromium sub-directory, roll it to the HEAD revision (notice
+ this means it may be ahead of the chromium_revision, but generally these
+ should be close).
+ - If it's another DEPS entry (not shared with Chromium), roll it to HEAD
+ unless it's configured to be skipped.
+
+ Returns:
+ A list of ChangedDep objects representing the changed deps.
+ """
+ result = []
+ libyuv_entries = BuildDepsentryDict(libyuv_deps)
+ new_cr_entries = BuildDepsentryDict(new_cr_deps)
+ for path, libyuv_deps_entry in libyuv_entries.items():
+ if path in DONT_AUTOROLL_THESE:
+ continue
+ cr_deps_entry = new_cr_entries.get(path)
+ if cr_deps_entry:
+ assert type(cr_deps_entry) is type(libyuv_deps_entry)
+
+ if isinstance(cr_deps_entry, CipdDepsEntry):
+ result.extend(
+ _FindChangedCipdPackages(path, libyuv_deps_entry.packages,
+ cr_deps_entry.packages))
+ continue
+
+ if isinstance(cr_deps_entry, VersionEntry):
+ result.extend(
+ _FindChangedVars(path, libyuv_deps_entry.version,
+ cr_deps_entry.version))
+ continue
+
+ # Use the revision from Chromium's DEPS file.
+ new_rev = cr_deps_entry.revision
+ assert libyuv_deps_entry.url == cr_deps_entry.url, (
+ 'LibYUV DEPS entry %s has a different URL %s than Chromium %s.' %
+ (path, libyuv_deps_entry.url, cr_deps_entry.url))
+ else:
+ if isinstance(libyuv_deps_entry, DepsEntry):
+ # Use the HEAD of the deps repo.
+ stdout, _ = _RunCommand(
+ ['git', 'ls-remote', libyuv_deps_entry.url, 'HEAD'])
+ new_rev = stdout.strip().split('\t')[0]
+ else:
+ # The dependency has been removed from chromium.
+ # This is handled by FindRemovedDeps.
+ continue
+
+ # Check if an update is necessary.
+ if libyuv_deps_entry.revision != new_rev:
+ logging.debug('Roll dependency %s to %s', path, new_rev)
+ result.append(
+ ChangedDep(path, libyuv_deps_entry.url, libyuv_deps_entry.revision,
+ new_rev))
+ return sorted(result)
+
+
+def CalculateChangedClang(new_cr_rev):
+
+ def GetClangRev(lines):
+ for line in lines:
+ match = CLANG_REVISION_RE.match(line)
+ if match:
+ return match.group(1)
+ raise RollError('Could not parse Clang revision!')
+
+ with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'r') as f:
+ current_lines = f.readlines()
+ current_rev = GetClangRev(current_lines)
+
+ new_clang_update_py = ReadRemoteCrFile(CLANG_UPDATE_SCRIPT_URL_PATH,
+ new_cr_rev).splitlines()
+ new_rev = GetClangRev(new_clang_update_py)
+ return ChangedDep(CLANG_UPDATE_SCRIPT_LOCAL_PATH, None, current_rev, new_rev)
+
+
+def GenerateCommitMessage(
+ rev_update,
+ current_commit_pos,
+ new_commit_pos,
+ changed_deps_list,
+ added_deps_paths=None,
+ removed_deps_paths=None,
+ clang_change=None,
+):
+ current_cr_rev = rev_update.current_chromium_rev[0:10]
+ new_cr_rev = rev_update.new_chromium_rev[0:10]
+ rev_interval = '%s..%s' % (current_cr_rev, new_cr_rev)
+ git_number_interval = '%s:%s' % (current_commit_pos, new_commit_pos)
+
+ commit_msg = [
+ 'Roll chromium_revision %s (%s)\n' % (rev_interval, git_number_interval),
+ 'Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval),
+ 'Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE % rev_interval)
+ ]
+
+ def Section(adjective, deps):
+ noun = 'dependency' if len(deps) == 1 else 'dependencies'
+ commit_msg.append('%s %s' % (adjective, noun))
+
+ if changed_deps_list:
+ Section('Changed', changed_deps_list)
+
+ for c in changed_deps_list:
+ if isinstance(c, ChangedCipdPackage):
+ commit_msg.append('* %s: %s..%s' %
+ (c.path, c.current_version, c.new_version))
+ elif isinstance(c, ChangedVersionEntry):
+      commit_msg.append('* %s_version: %s..%s' %
+ (c.path, c.current_version, c.new_version))
+ else:
+ commit_msg.append('* %s: %s/+log/%s..%s' %
+ (c.path, c.url, c.current_rev[0:10], c.new_rev[0:10]))
+
+ if added_deps_paths:
+ Section('Added', added_deps_paths)
+ commit_msg.extend('* %s' % p for p in added_deps_paths)
+
+ if removed_deps_paths:
+ Section('Removed', removed_deps_paths)
+ commit_msg.extend('* %s' % p for p in removed_deps_paths)
+
+ if any([changed_deps_list, added_deps_paths, removed_deps_paths]):
+ change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, 'DEPS')
+ commit_msg.append('DEPS diff: %s\n' % change_url)
+ else:
+ commit_msg.append('No dependencies changed.')
+
+ if clang_change and clang_change.current_rev != clang_change.new_rev:
+ commit_msg.append('Clang version changed %s:%s' %
+ (clang_change.current_rev, clang_change.new_rev))
+ change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval,
+ CLANG_UPDATE_SCRIPT_URL_PATH)
+ commit_msg.append('Details: %s\n' % change_url)
+ else:
+ commit_msg.append('No update to Clang.\n')
+
+ commit_msg.append('BUG=None')
+ return '\n'.join(commit_msg)
+
+
+def UpdateDepsFile(deps_filename, rev_update, changed_deps, new_cr_content):
+ """Update the DEPS file with the new revision."""
+
+ with open(deps_filename, 'rb') as deps_file:
+ deps_content = deps_file.read().decode('utf-8')
+
+ # Update the chromium_revision variable.
+ deps_content = deps_content.replace(rev_update.current_chromium_rev,
+ rev_update.new_chromium_rev)
+
+ # Add and remove dependencies. For now: only generated android deps.
+  # Since gclient cannot add or remove deps, we rely on the fact that
+  # these android deps are located in one place that we can copy/paste.
+ deps_re = re.compile(ANDROID_DEPS_START + '.*' + ANDROID_DEPS_END, re.DOTALL)
+ new_deps = deps_re.search(new_cr_content)
+ old_deps = deps_re.search(deps_content)
+ if not new_deps or not old_deps:
+ faulty = 'Chromium' if not new_deps else 'LibYUV'
+ raise RollError('Was expecting to find "%s" and "%s"\n'
+ 'in %s DEPS' %
+ (ANDROID_DEPS_START, ANDROID_DEPS_END, faulty))
+ deps_content = deps_re.sub(new_deps.group(0), deps_content)
+
+ for dep in changed_deps:
+ if isinstance(dep, ChangedVersionEntry):
+ deps_content = deps_content.replace(dep.current_version, dep.new_version)
+
+ with open(deps_filename, 'wb') as deps_file:
+ deps_file.write(deps_content.encode('utf-8'))
+
+ # Update each individual DEPS entry.
+ for dep in changed_deps:
+    # ChangedVersionEntry types have already been processed above.
+ if isinstance(dep, ChangedVersionEntry):
+ continue
+ local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path)
+ if not os.path.isdir(local_dep_dir):
+ raise RollError(
+ 'Cannot find local directory %s. Either run\n'
+ 'gclient sync --deps=all\n'
+ 'or make sure the .gclient file for your solution contains all '
+ 'platforms in the target_os list, i.e.\n'
+ 'target_os = ["android", "unix", "mac", "ios", "win"];\n'
+ 'Then run "gclient sync" again.' % local_dep_dir)
+ if isinstance(dep, ChangedCipdPackage):
+ package = dep.package.format() # Eliminate double curly brackets
+ update = '%s:%s@%s' % (dep.path, package, dep.new_version)
+ else:
+ update = '%s@%s' % (dep.path, dep.new_rev)
+ _RunCommand(['gclient', 'setdep', '--revision', update],
+ working_dir=CHECKOUT_SRC_DIR)
+
+
+def _IsTreeClean():
+ stdout, _ = _RunCommand(['git', 'status', '--porcelain'])
+ if len(stdout) == 0:
+ return True
+
+ logging.error('Dirty/unversioned files:\n%s', stdout)
+ return False
+
+
+def _EnsureUpdatedMainBranch(dry_run):
+ current_branch = _RunCommand(['git', 'rev-parse', '--abbrev-ref',
+ 'HEAD'])[0].splitlines()[0]
+ if current_branch != 'main':
+ logging.error('Please checkout the main branch and re-run this script.')
+ if not dry_run:
+ sys.exit(-1)
+
+ logging.info('Updating main branch...')
+ _RunCommand(['git', 'pull'])
+
+
+def _CreateRollBranch(dry_run):
+ logging.info('Creating roll branch: %s', ROLL_BRANCH_NAME)
+ if not dry_run:
+ _RunCommand(['git', 'checkout', '-b', ROLL_BRANCH_NAME])
+
+
+def _RemovePreviousRollBranch(dry_run):
+ active_branch, branches = _GetBranches()
+ if active_branch == ROLL_BRANCH_NAME:
+ active_branch = 'main'
+ if ROLL_BRANCH_NAME in branches:
+ logging.info('Removing previous roll branch (%s)', ROLL_BRANCH_NAME)
+ if not dry_run:
+ _RunCommand(['git', 'checkout', active_branch])
+ _RunCommand(['git', 'branch', '-D', ROLL_BRANCH_NAME])
+
+
+def _LocalCommit(commit_msg, dry_run):
+ logging.info('Committing changes locally.')
+ if not dry_run:
+ _RunCommand(['git', 'add', '--update', '.'])
+ _RunCommand(['git', 'commit', '-m', commit_msg])
+
+
+def ChooseCQMode(skip_cq, cq_over, current_commit_pos, new_commit_pos):
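+  # Returns 0 to skip the CQ entirely, 1 for a CQ dry run, and 2 to submit
+  # to the CQ (see _UploadCL for how each mode is applied).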
+ if skip_cq:
+ return 0
+ if (new_commit_pos - current_commit_pos) < cq_over:
+ return 1
+ return 2
+
+
+def _GetCcRecipients(changed_deps_list):
+ """Returns a list of emails to notify based on the changed deps list.
+ """
+  cc_recipients = []
+  # No per-dependency CC rules are defined at the moment.
+  return cc_recipients
+
+
+def _UploadCL(commit_queue_mode, add_cc=None):
+ """Upload the committed changes as a changelist to Gerrit.
+
+ commit_queue_mode:
+ - 2: Submit to commit queue.
+ - 1: Run trybots but do not submit to CQ.
+ - 0: Skip CQ, upload only.
+
+ add_cc: A list of email addresses to add as CC recipients.
+ """
+ cc_recipients = []
+ if add_cc:
+ cc_recipients.extend(add_cc)
+ cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks']
+ if commit_queue_mode >= 2:
+ logging.info('Sending the CL to the CQ...')
+ cmd.extend(['-o', 'label=Bot-Commit+1'])
+ cmd.extend(['-o', 'label=Commit-Queue+2'])
+ cmd.extend(['--send-mail', '--cc', ','.join(cc_recipients)])
+ elif commit_queue_mode >= 1:
+ logging.info('Starting CQ dry run...')
+ cmd.extend(['-o', 'label=Commit-Queue+1'])
+ extra_env = {
+ 'EDITOR': 'true',
+ 'SKIP_GCE_AUTH_FOR_GIT': '1',
+ }
+ stdout, stderr = _RunCommand(cmd, extra_env=extra_env)
+ logging.debug('Output from "git cl upload":\nstdout:\n%s\n\nstderr:\n%s',
+ stdout, stderr)
+
+
+def GetRollRevisionRanges(opts, libyuv_deps):
+ current_cr_rev = libyuv_deps['vars']['chromium_revision']
+ new_cr_rev = opts.revision
+ if not new_cr_rev:
+ stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD'])
+ head_rev = stdout.strip().split('\t')[0]
+ logging.info('No revision specified. Using HEAD: %s', head_rev)
+ new_cr_rev = head_rev
+
+ return ChromiumRevisionUpdate(current_cr_rev, new_cr_rev)
+
+
+def main():
+ p = argparse.ArgumentParser()
+ p.add_argument('--clean',
+ action='store_true',
+ default=False,
+ help='Removes any previous local roll branch.')
+ p.add_argument('-r',
+ '--revision',
+ help=('Chromium Git revision to roll to. Defaults to the '
+ 'Chromium HEAD revision if omitted.'))
+ p.add_argument('--dry-run',
+ action='store_true',
+ default=False,
+ help=('Calculate changes and modify DEPS, but don\'t create '
+ 'any local branch, commit, upload CL or send any '
+ 'tryjobs.'))
+ p.add_argument('-i',
+ '--ignore-unclean-workdir',
+ action='store_true',
+ default=False,
+ help=('Ignore if the current branch is not main or if there '
+ 'are uncommitted changes (default: %(default)s).'))
+ grp = p.add_mutually_exclusive_group()
+ grp.add_argument('--skip-cq',
+ action='store_true',
+ default=False,
+ help='Skip sending the CL to the CQ (default: %(default)s)')
+ grp.add_argument('--cq-over',
+ type=int,
+ default=1,
+ help=('Commit queue dry run if the revision difference '
+ 'is below this number (default: %(default)s)'))
+ p.add_argument('-v',
+ '--verbose',
+ action='store_true',
+ default=False,
+ help='Be extra verbose in printing of log messages.')
+ opts = p.parse_args()
+
+ if opts.verbose:
+ logging.basicConfig(level=logging.DEBUG)
+ else:
+ logging.basicConfig(level=logging.INFO)
+
+ if not opts.ignore_unclean_workdir and not _IsTreeClean():
+ logging.error('Please clean your local checkout first.')
+ return 1
+
+ if opts.clean:
+ _RemovePreviousRollBranch(opts.dry_run)
+
+ if not opts.ignore_unclean_workdir:
+ _EnsureUpdatedMainBranch(opts.dry_run)
+
+ deps_filename = os.path.join(CHECKOUT_SRC_DIR, 'DEPS')
+ libyuv_deps = ParseLocalDepsFile(deps_filename)
+
+ rev_update = GetRollRevisionRanges(opts, libyuv_deps)
+
+ current_commit_pos = ParseCommitPosition(
+ ReadRemoteCrCommit(rev_update.current_chromium_rev))
+ new_commit_pos = ParseCommitPosition(
+ ReadRemoteCrCommit(rev_update.new_chromium_rev))
+
+ new_cr_content = ReadRemoteCrFile('DEPS', rev_update.new_chromium_rev)
+ new_cr_deps = ParseDepsDict(new_cr_content)
+ changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps)
+ # Discard other deps, assumed to be chromium-only dependencies.
+ new_generated_android_deps, _ = FindAddedDeps(libyuv_deps, new_cr_deps)
+ removed_generated_android_deps, other_deps = FindRemovedDeps(
+ libyuv_deps, new_cr_deps)
+ if other_deps:
+ raise RollError('LibYUV DEPS entries are missing from Chromium: %s.\n'
+ 'Remove them or add them to either '
+ 'LIBYUV_ONLY_DEPS or DONT_AUTOROLL_THESE.' % other_deps)
+ clang_change = CalculateChangedClang(rev_update.new_chromium_rev)
+ commit_msg = GenerateCommitMessage(
+ rev_update,
+ current_commit_pos,
+ new_commit_pos,
+ changed_deps,
+ added_deps_paths=new_generated_android_deps,
+ removed_deps_paths=removed_generated_android_deps,
+ clang_change=clang_change)
+ logging.debug('Commit message:\n%s', commit_msg)
+
+ _CreateRollBranch(opts.dry_run)
+ if not opts.dry_run:
+ UpdateDepsFile(deps_filename, rev_update, changed_deps, new_cr_content)
+ if _IsTreeClean():
+ logging.info("No DEPS changes detected, skipping CL creation.")
+ else:
+ _LocalCommit(commit_msg, opts.dry_run)
+ commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over,
+ current_commit_pos, new_commit_pos)
+ logging.info('Uploading CL...')
+ if not opts.dry_run:
+ _UploadCL(commit_queue_mode, _GetCcRecipients(changed_deps))
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/files/tools_libyuv/autoroller/unittests/roll_deps_test.py b/tools_libyuv/autoroller/unittests/roll_deps_test.py
index 477b6e40..af86bdd5 100755
--- a/files/tools_libyuv/autoroller/unittests/roll_deps_test.py
+++ b/tools_libyuv/autoroller/unittests/roll_deps_test.py
@@ -1,4 +1,5 @@
-#!/usr/bin/env python
+#!/usr/bin/env vpython3
+
# Copyright 2017 The LibYuv Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style license
@@ -14,14 +15,13 @@ import sys
import tempfile
import unittest
-
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-PARENT_DIR = os.path.join(SCRIPT_DIR, os.pardir)
-sys.path.append(PARENT_DIR)
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PARENT_DIR = os.path.join(SCRIPT_DIR, os.pardir)
+sys.path.append(PARENT_DIR)
+
import roll_deps
from roll_deps import CalculateChangedDeps, GetMatchingDepsEntries, \
    ParseDepsDict, ParseLocalDepsFile, UpdateDepsFile
TEST_DATA_VARS = {
'chromium_git': 'https://chromium.googlesource.com',
@@ -45,7 +45,7 @@ class TestError(Exception):
pass
-class FakeCmd(object):
+class FakeCmd:
def __init__(self):
self.expectations = []
@@ -85,43 +85,43 @@ class TestRollChromiumRevision(unittest.TestCase):
def testVarLookup(self):
local_scope = {'foo': 'wrong', 'vars': {'foo': 'bar'}}
lookup = roll_deps.VarLookup(local_scope)
- self.assertEquals(lookup('foo'), 'bar')
+ self.assertEqual(lookup('foo'), 'bar')
def testUpdateDepsFile(self):
new_rev = 'aaaaabbbbbcccccdddddeeeeefffff0000011111'
current_rev = TEST_DATA_VARS['chromium_revision']
UpdateDepsFile(self._libyuv_depsfile, current_rev, new_rev, [])
- with open(self._libyuv_depsfile) as deps_file:
+ with open(self._libyuv_depsfile, 'r') as deps_file:
deps_contents = deps_file.read()
self.assertTrue(new_rev in deps_contents,
'Failed to find %s in\n%s' % (new_rev, deps_contents))
def testParseDepsDict(self):
- with open(self._libyuv_depsfile) as deps_file:
+ with open(self._libyuv_depsfile, 'r') as deps_file:
deps_contents = deps_file.read()
local_scope = ParseDepsDict(deps_contents)
vars_dict = local_scope['vars']
def assertVar(variable_name):
- self.assertEquals(vars_dict[variable_name], TEST_DATA_VARS[variable_name])
+ self.assertEqual(vars_dict[variable_name], TEST_DATA_VARS[variable_name])
assertVar('chromium_git')
assertVar('chromium_revision')
- self.assertEquals(len(local_scope['deps']), 3)
+ self.assertEqual(len(local_scope['deps']), 3)
def testGetMatchingDepsEntriesReturnsPathInSimpleCase(self):
entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/testing/gtest')
- self.assertEquals(len(entries), 1)
- self.assertEquals(entries[0], DEPS_ENTRIES['src/testing/gtest'])
+ self.assertEqual(len(entries), 1)
+ self.assertEqual(entries[0], DEPS_ENTRIES['src/testing/gtest'])
def testGetMatchingDepsEntriesHandlesSimilarStartingPaths(self):
entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/testing')
- self.assertEquals(len(entries), 2)
+ self.assertEqual(len(entries), 2)
def testGetMatchingDepsEntriesHandlesTwoPathsWithIdenticalFirstParts(self):
entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/build')
- self.assertEquals(len(entries), 1)
- self.assertEquals(entries[0], DEPS_ENTRIES['src/build'])
+ self.assertEqual(len(entries), 1)
+ self.assertEqual(entries[0], DEPS_ENTRIES['src/build'])
def testCalculateChangedDeps(self):
_SetupGitLsRemoteCall(self.fake,
@@ -129,14 +129,14 @@ class TestRollChromiumRevision(unittest.TestCase):
libyuv_deps = ParseLocalDepsFile(self._libyuv_depsfile)
new_cr_deps = ParseLocalDepsFile(self._new_cr_depsfile)
changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps)
- self.assertEquals(len(changed_deps), 2)
- self.assertEquals(changed_deps[0].path, 'src/build')
- self.assertEquals(changed_deps[0].current_rev, BUILD_OLD_REV)
- self.assertEquals(changed_deps[0].new_rev, BUILD_NEW_REV)
-
- self.assertEquals(changed_deps[1].path, 'src/buildtools')
- self.assertEquals(changed_deps[1].current_rev, BUILDTOOLS_OLD_REV)
- self.assertEquals(changed_deps[1].new_rev, BUILDTOOLS_NEW_REV)
+ self.assertEqual(len(changed_deps), 2)
+ self.assertEqual(changed_deps[0].path, 'src/build')
+ self.assertEqual(changed_deps[0].current_rev, BUILD_OLD_REV)
+ self.assertEqual(changed_deps[0].new_rev, BUILD_NEW_REV)
+
+ self.assertEqual(changed_deps[1].path, 'src/buildtools')
+ self.assertEqual(changed_deps[1].current_rev, BUILDTOOLS_OLD_REV)
+ self.assertEqual(changed_deps[1].new_rev, BUILDTOOLS_NEW_REV)
def _SetupGitLsRemoteCall(cmd_fake, url, revision):
diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS b/tools_libyuv/autoroller/unittests/testdata/DEPS
index 9fbb48a7..4f45860c 100644
--- a/files/tools_libyuv/autoroller/unittests/testdata/DEPS
+++ b/tools_libyuv/autoroller/unittests/testdata/DEPS
@@ -3,6 +3,7 @@
vars = {
'chromium_git': 'https://chromium.googlesource.com',
'chromium_revision': '1b9c098a08e40114e44b6c1ec33ddf95c40b901d',
+ 'ignored_str': Str(''),
}
deps = {
diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new b/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new
index d53083ce..d53083ce 100644
--- a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new
+++ b/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new
diff --git a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old b/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old
index dd6ddaec..dd6ddaec 100644
--- a/files/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old
+++ b/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old
diff --git a/files/tools_libyuv/get_landmines.py b/tools_libyuv/get_landmines.py
index c554f04a..8b33483e 100755
--- a/files/tools_libyuv/get_landmines.py
+++ b/tools_libyuv/get_landmines.py
@@ -1,4 +1,5 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
+
# Copyright 2016 The LibYuv Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style license
@@ -25,8 +26,8 @@ def print_landmines():
# dependency problems, fix the dependency problems instead of adding a
# landmine.
# See the Chromium version in src/build/get_landmines.py for usage examples.
- print 'Clobber to remove GYP artifacts after switching bots to GN.'
- print 'Another try to remove GYP artifacts after switching bots to GN.'
+ print('Clobber to remove GYP artifacts after switching bots to GN.')
+ print('Another try to remove GYP artifacts after switching bots to GN.')
def main():
diff --git a/tools_libyuv/msan/OWNERS b/tools_libyuv/msan/OWNERS
new file mode 100644
index 00000000..9b67a8f6
--- /dev/null
+++ b/tools_libyuv/msan/OWNERS
@@ -0,0 +1,3 @@
+mbonadei@chromium.org
+fbarchard@chromium.org
+pbos@chromium.org
diff --git a/files/tools_libyuv/msan/blacklist.txt b/tools_libyuv/msan/blacklist.txt
index 8b5e42a7..8b5e42a7 100644
--- a/files/tools_libyuv/msan/blacklist.txt
+++ b/tools_libyuv/msan/blacklist.txt
diff --git a/tools_libyuv/ubsan/OWNERS b/tools_libyuv/ubsan/OWNERS
new file mode 100644
index 00000000..9b67a8f6
--- /dev/null
+++ b/tools_libyuv/ubsan/OWNERS
@@ -0,0 +1,3 @@
+mbonadei@chromium.org
+fbarchard@chromium.org
+pbos@chromium.org
diff --git a/files/tools_libyuv/ubsan/blacklist.txt b/tools_libyuv/ubsan/blacklist.txt
index 8bcb2907..8bcb2907 100644
--- a/files/tools_libyuv/ubsan/blacklist.txt
+++ b/tools_libyuv/ubsan/blacklist.txt
diff --git a/files/tools_libyuv/ubsan/vptr_blacklist.txt b/tools_libyuv/ubsan/vptr_blacklist.txt
index 23cfca53..23cfca53 100644
--- a/files/tools_libyuv/ubsan/vptr_blacklist.txt
+++ b/tools_libyuv/ubsan/vptr_blacklist.txt
diff --git a/files/unit_test/basictypes_test.cc b/unit_test/basictypes_test.cc
index 9aaa2dcd..9aaa2dcd 100644
--- a/files/unit_test/basictypes_test.cc
+++ b/unit_test/basictypes_test.cc
diff --git a/files/unit_test/color_test.cc b/unit_test/color_test.cc
index 4bb448d5..01267ff1 100644
--- a/files/unit_test/color_test.cc
+++ b/unit_test/color_test.cc
@@ -20,20 +20,22 @@
namespace libyuv {
-// TODO(fbarchard): Port high accuracy YUV to RGB to Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define ERROR_R 1
-#define ERROR_G 1
-#define ERROR_B 3
-#define ERROR_FULL 6
-#define ERROR_J420 5
+// TODO(fbarchard): clang x86 has a higher accuracy YUV to RGB.
+// Port to Visual C and other CPUs
+#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__))
+#define ERROR_FULL 5
+#define ERROR_J420 4
#else
+#define ERROR_FULL 6
+#define ERROR_J420 6
+#endif
#define ERROR_R 1
#define ERROR_G 1
-#define ERROR_B 3
-#define ERROR_FULL 5
-#define ERROR_J420 3
+#ifdef LIBYUV_UNLIMITED_DATA
+#define ERROR_B 1
+#else
+#define ERROR_B 18
#endif
#define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \
@@ -187,6 +189,104 @@ static void YUVJToRGB(int y, int u, int v, int* r, int* g, int* b) {
*r = orig_pixels[2];
}
+static void YUVHToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ H422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
+#define F422ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j)
+
+static void YUVFToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ F422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
+static void YUVUToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ U422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
+#define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+
+static void YUVVToRGB(int y, int u, int v, int* r, int* g, int* b) {
+ const int kWidth = 16;
+ const int kHeight = 1;
+ const int kPixels = kWidth * kHeight;
+ const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
+
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
+ memset(orig_y, y, kPixels);
+ memset(orig_u, u, kHalfPixels);
+ memset(orig_v, v, kHalfPixels);
+
+ /* YUV converted to ARGB. */
+ V422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
+ orig_pixels, kWidth * 4, kWidth, kHeight);
+
+ *b = orig_pixels[0];
+ *g = orig_pixels[1];
+ *r = orig_pixels[2];
+}
+
static void YToRGB(int y, int* r, int* g, int* b) {
const int kWidth = 16;
const int kHeight = 1;
@@ -335,18 +435,50 @@ TEST_F(LibYUVColorTest, TestRoundToByte) {
EXPECT_LE(allb, 255);
}
+// BT.601 limited range YUV to RGB reference
static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);
*g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);
*b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);
}
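+// E.g. limited range white (y=235, u=v=128) maps to r=g=b=255.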
+// BT.601 full range YUV to RGB reference (aka JPEG)
static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte(y - (v - 128) * -1.40200);
*g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414);
*b = RoundToByte(y - (u - 128) * -1.77200);
}
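+// E.g. full range grey (y=128, u=v=128) maps to r=g=b=128.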
+// BT.709 limited range YUV to RGB reference
+// See also http://www.equasys.de/colorconversion.html
+static void YUVHToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.793);
+ *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.213 - (v - 128) * 0.533);
+ *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.112);
+}
+
+// BT.709 full range YUV to RGB reference
+static void YUVFToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte(y - (v - 128) * -1.5748);
+ *g = RoundToByte(y - (u - 128) * 0.18732 - (v - 128) * 0.46812);
+ *b = RoundToByte(y - (u - 128) * -1.8556);
+}
+
+// BT.2020 limited range YUV to RGB reference
+static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte((y - 16) * 1.164384 - (v - 128) * -1.67867);
+ *g = RoundToByte((y - 16) * 1.164384 - (u - 128) * 0.187326 -
+ (v - 128) * 0.65042);
+ *b = RoundToByte((y - 16) * 1.164384 - (u - 128) * -2.14177);
+}
+
+// BT.2020 full range YUV to RGB reference
+static void YUVVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+ *r = RoundToByte(y + (v - 128) * 1.474600);
+ *g = RoundToByte(y - (u - 128) * 0.164553 - (v - 128) * 0.571353);
+ *b = RoundToByte(y + (u - 128) * 1.881400);
+}
+
TEST_F(LibYUVColorTest, TestYUV) {
int r0, g0, b0, r1, g1, b1;
@@ -370,7 +502,11 @@ TEST_F(LibYUVColorTest, TestYUV) {
YUVToRGB(240, 0, 0, &r1, &g1, &b1);
EXPECT_EQ(57, r1);
EXPECT_EQ(255, g1);
+#ifdef LIBYUV_UNLIMITED_DATA
+ EXPECT_EQ(3, b1);
+#else
EXPECT_EQ(5, b1);
+#endif
for (int i = 0; i < 256; ++i) {
YUVToRGBReference(i, 128, 128, &r0, &g0, &b0);
@@ -444,28 +580,28 @@ TEST_F(LibYUVColorTest, TestGreyYUV) {
static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
int i;
- printf("hist");
+ printf("hist ");
for (i = 0; i < 256; ++i) {
if (rh[i] || gh[i] || bh[i]) {
- printf("\t%8d", i - 128);
+ printf(" %8d", i - 128);
}
}
- printf("\nred");
+ printf("\nred ");
for (i = 0; i < 256; ++i) {
if (rh[i] || gh[i] || bh[i]) {
- printf("\t%8d", rh[i]);
+ printf(" %8d", rh[i]);
}
}
printf("\ngreen");
for (i = 0; i < 256; ++i) {
if (rh[i] || gh[i] || bh[i]) {
- printf("\t%8d", gh[i]);
+ printf(" %8d", gh[i]);
}
}
- printf("\nblue");
+ printf("\nblue ");
for (i = 0; i < 256; ++i) {
if (rh[i] || gh[i] || bh[i]) {
- printf("\t%8d", bh[i]);
+ printf(" %8d", bh[i]);
}
}
printf("\n");
@@ -473,7 +609,13 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
// Stepping by 5 on the inner loop still covers 0 to 255 inclusive.
// Set to 1 for better coverage; 3, 5 or 17 for faster testing.
+#ifdef DISABLE_SLOW_TESTS
#define FASTSTEP 5
+#else
+#define FASTSTEP 1
+#endif
+
+// BT.601 limited range.
TEST_F(LibYUVColorTest, TestFullYUV) {
int rh[256] = {
0,
@@ -503,6 +645,7 @@ TEST_F(LibYUVColorTest, TestFullYUV) {
PrintHistogram(rh, gh, bh);
}
+// BT.601 full range.
TEST_F(LibYUVColorTest, TestFullYUVJ) {
int rh[256] = {
0,
@@ -520,9 +663,129 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) {
int y = RANDOM256(y2);
YUVJToRGBReference(y, u, v, &r0, &g0, &b0);
YUVJToRGB(y, u, v, &r1, &g1, &b1);
- EXPECT_NEAR(r0, r1, 1);
- EXPECT_NEAR(g0, g1, 1);
- EXPECT_NEAR(b0, b1, 1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
+
+// BT.709 limited range.
+TEST_F(LibYUVColorTest, TestFullYUVH) {
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVHToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVHToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
+
+// BT.709 full range.
+TEST_F(LibYUVColorTest, TestFullYUVF) {
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVFToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVFToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
+
+// BT.2020 limited range.
+TEST_F(LibYUVColorTest, TestFullYUVU) {
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVUToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVUToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, ERROR_G);
+ EXPECT_NEAR(b0, b1, ERROR_B);
+ ++rh[r1 - r0 + 128];
+ ++gh[g1 - g0 + 128];
+ ++bh[b1 - b0 + 128];
+ }
+ }
+ }
+ PrintHistogram(rh, gh, bh);
+}
+
+// BT.2020 full range.
+TEST_F(LibYUVColorTest, TestFullYUVV) {
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
+ for (int u = 0; u < 256; ++u) {
+ for (int v = 0; v < 256; ++v) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
+ int r0, g0, b0, r1, g1, b1;
+ int y = RANDOM256(y2);
+ YUVVToRGBReference(y, u, v, &r0, &g0, &b0);
+ YUVVToRGB(y, u, v, &r1, &g1, &b1);
+ EXPECT_NEAR(r0, r1, ERROR_R);
+ EXPECT_NEAR(g0, g1, 2);
+ EXPECT_NEAR(b0, b1, ERROR_B);
++rh[r1 - r0 + 128];
++gh[g1 - g0 + 128];
++bh[b1 - b0 + 128];
diff --git a/files/unit_test/compare_test.cc b/unit_test/compare_test.cc
index 136254e1..c29562cb 100644
--- a/files/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -15,10 +15,13 @@
#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
-#include "libyuv/compare_row.h" /* For HammingDistance_C */
#include "libyuv/cpu_id.h"
#include "libyuv/video_common.h"
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/compare_row.h" /* For HammingDistance_C */
+#endif
+
namespace libyuv {
// hash seed of 5381 recommended.
@@ -206,6 +209,7 @@ TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Unaligned) {
free_aligned_buffer_page_end(src_a);
}
+#ifdef ENABLE_ROW_TESTS
TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth);
@@ -340,7 +344,7 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848
TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
uint32_t h1 = 0;
- const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 31) & ~31;
+ const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 63) & ~63;
align_buffer_page_end(src_a, kMaxWidth);
align_buffer_page_end(src_b, kMaxWidth);
memset(src_a, 255u, kMaxWidth);
@@ -403,6 +407,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
free_aligned_buffer_page_end(src_a);
free_aligned_buffer_page_end(src_b);
}
+#endif // ENABLE_ROW_TESTS
TEST_F(LibYUVCompareTest, TestHammingDistance) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc
new file mode 100644
index 00000000..aeee8a7f
--- /dev/null
+++ b/unit_test/convert_argb_test.cc
@@ -0,0 +1,2700 @@
+/*
+ * Copyright 2023 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "../unit_test/unit_test.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/video_common.h"
+
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */
+#endif
+
+#if defined(__riscv) && !defined(__clang__)
+#define DISABLE_SLOW_TESTS
+#undef ENABLE_FULL_TESTS
+#undef ENABLE_ROW_TESTS
+#define LEAN_TESTS
+#endif
+
+// Some functions fail on big endian. Enable these tests on all CPUs except
+// PowerPC; they are not optimized, so they are disabled by default.
+#if !defined(DISABLE_SLOW_TESTS) && !defined(__powerpc__)
+#define LITTLE_ENDIAN_ONLY_TEST 1
+#endif
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that are unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
+namespace libyuv {
+
+// Aliases that copy pixels as-is.
+#define AR30ToAR30 ARGBCopy
+#define ABGRToABGR ARGBCopy
+
+// Subsampled dimensions are computed with a round-up divide.
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
+
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
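+// E.g. SUBSAMPLE(5, 2) == 3 (round-up divide); ALIGNINT(5, 4) == 8 (round up
+// to a multiple of ALIGN).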
+
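+// Converts biplanar (Y + interleaved UV) input to planar output twice: once
+// with SIMD masked off (the C reference) and once with the benchmark CPU
+// flags, then checks the optimized output matches the C output byte for byte.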
+#define TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, W1280, N, NEG, OFF, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
+ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
+ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
+ "SRC_SUBSAMP_X unsupported"); \
+ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
+ "SRC_SUBSAMP_Y unsupported"); \
+ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
+ "DST_SUBSAMP_X unsupported"); \
+ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
+ "DST_SUBSAMP_Y unsupported"); \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
+ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
+ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
+ const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \
+ const int kPaddedHeight = \
+ (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \
+ const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \
+ const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
+ align_buffer_page_end( \
+ src_uv, kSrcHalfPaddedWidth* kSrcHalfPaddedHeight* SRC_BPC * 2 + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
+ SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
+ for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) { \
+ src_y_p[i] = \
+ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
+ } \
+ for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2; ++i) { \
+ src_uv_p[i] = \
+ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
+ memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
+ memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
+ reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
+ reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
+ NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
+ reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
+ reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
+ NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
+ EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
+ } \
+ for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
+ EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
+ EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTBPTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
+ TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 2, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT)
+#else
+#define TESTBPTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
+ TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT)
+#endif
+
+TESTBPTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBPTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBPTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32)
+TESTBPTOP(P010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10, 1, 1)
+TESTBPTOP(P012, uint16_t, 2, 2, 2, I012, uint16_t, 2, 2, 2, 12, 1, 1)
+
+// Provide matrix wrappers for full range bt.709
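+// The ABGR variants swap the U and V planes and use the YVU constants, so the
+// same I4xxToARGBMatrix entry points emit the byte-swapped ordering.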
+#define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j)
+#define F420ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j)
+#define F422ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j)
+#define F422ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j)
+#define F444ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I444ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j)
+#define F444ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j)
+
+// Provide matrix wrappers for full range bt.2020
+#define V420ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
+#define V420ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+#define V422ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
+#define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+#define V444ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I444ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
+#define V444ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
+
+#define I420ToARGBFilter(a, b, c, d, e, f, g, h, i, j) \
+ I420ToARGBMatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+#define I422ToARGBFilter(a, b, c, d, e, f, g, h, i, j) \
+ I422ToARGBMatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+#define I420ToRGB24Filter(a, b, c, d, e, f, g, h, i, j) \
+ I420ToRGB24MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+#define I422ToRGB24Filter(a, b, c, d, e, f, g, h, i, j) \
+ I420ToRGB24MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+
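+// Converts planar YUV to packed FMT_B with the C path and the optimized path,
+// prints the timing of each, and requires identical output.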
+#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ double time0 = get_time(); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \
+ kWidth, NEG kHeight); \
+ double time1 = get_time(); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_opt + OFF, \
+ kStrideB, kWidth, NEG kHeight); \
+ } \
+ double time2 = get_time(); \
+ printf(" %8d us C - %8d us OPT\n", \
+ static_cast<int>((time1 - time0) * 1e6), \
+ static_cast<int>((time2 - time1) * 1e6 / benchmark_iterations_)); \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ + 1, _Any, +, 0) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Unaligned, +, 4) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Invert, -, 0) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0)
+#else
+#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0)
+#endif
+
+#if defined(ENABLE_FULL_TESTS)
+TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(F420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(F420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(U420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(U420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(V420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(V420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1)
+TESTPLANARTOB(J420, 2, 2, RAW, 3, 3, 1)
+TESTPLANARTOB(J420, 2, 2, RGB24, 3, 3, 1)
+TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1)
+TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(J420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(H420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1)
+TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1)
+#endif
+TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(U422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(U422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(V422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(V422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
+TESTPLANARTOB(I422, 1, 1, RGB24, 3, 3, 1)
+TESTPLANARTOB(I422, 1, 1, RAW, 3, 3, 1)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I444, 1, 1, RGB24, 3, 3, 1)
+TESTPLANARTOB(I444, 1, 1, RAW, 3, 3, 1)
+TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(J444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(H444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(H444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(U444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(U444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(V444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(V444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
+TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
+TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
+TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1)
+TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1)
+TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1)
+TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, AB30, 4, 4, 1)
+TESTPLANARTOB(H420, 2, 2, AB30, 4, 4, 1)
+#endif
+TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ARGBFilter, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, RGB24Filter, 3, 3, 1)
+TESTPLANARTOB(I422, 2, 2, RGB24Filter, 3, 3, 1)
+#else // FULL_TESTS
+TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1)
+TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1)
+TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1)
+#endif
+TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1)
+TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
+TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1)
+TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
+TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ARGBFilter, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, RGB24Filter, 3, 3, 1)
+TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
+#endif
+
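+// Biplanar to packed RGB: both outputs are expanded to ARGB before comparing,
+// so packed formats such as RGB565 can be checked bytewise.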
+#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = kWidth * BPP_B; \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, \
+ kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < kStrideUV * 2; ++j) { \
+ src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_argb_c, 1, kStrideB* kHeight); \
+ memset(dst_argb_opt, 101, kStrideB* kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
+ dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
+ dst_argb_opt, kWidth * BPP_B, kWidth, \
+ NEG kHeight); \
+ } \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight); \
+ align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \
+ memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \
+ memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \
+ FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \
+ kHeight); \
+ FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
+ kHeight); \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth * 4; ++j) { \
+ EXPECT_EQ(dst_argb32_c[i * kWidth * 4 + j], \
+ dst_argb32_opt[i * kWidth * 4 + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ free_aligned_buffer_page_end(dst_argb32_c); \
+ free_aligned_buffer_page_end(dst_argb32_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTBPTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \
+ TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_ + 1, _Any, +, 0) \
+ TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Unaligned, +, 2) \
+ TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Opt, +, 0)
+#else
+#define TESTBPTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \
+ TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, _Opt, +, 0)
+#endif
+
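+// Full range (JPEG) NV12/NV21 wrappers; the ABGR and RAW variants swap the
+// NV12/NV21 entry points and use the YVU constants.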
+#define JNV12ToARGB(a, b, c, d, e, f, g, h) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV21ToARGB(a, b, c, d, e, f, g, h) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV12ToABGR(a, b, c, d, e, f, g, h) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV21ToABGR(a, b, c, d, e, f, g, h) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV12ToRGB24(a, b, c, d, e, f, g, h) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV21ToRGB24(a, b, c, d, e, f, g, h) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+#define JNV12ToRAW(a, b, c, d, e, f, g, h) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV21ToRAW(a, b, c, d, e, f, g, h) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h)
+#define JNV12ToRGB565(a, b, c, d, e, f, g, h) \
+ NV12ToRGB565Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h)
+
+TESTBPTOB(JNV12, 2, 2, ARGB, ARGB, 4)
+TESTBPTOB(JNV21, 2, 2, ARGB, ARGB, 4)
+TESTBPTOB(JNV12, 2, 2, ABGR, ABGR, 4)
+TESTBPTOB(JNV21, 2, 2, ABGR, ABGR, 4)
+TESTBPTOB(JNV12, 2, 2, RGB24, RGB24, 3)
+TESTBPTOB(JNV21, 2, 2, RGB24, RGB24, 3)
+TESTBPTOB(JNV12, 2, 2, RAW, RAW, 3)
+TESTBPTOB(JNV21, 2, 2, RAW, RAW, 3)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTBPTOB(JNV12, 2, 2, RGB565, RGB565, 2)
+#endif
+
+TESTBPTOB(NV12, 2, 2, ARGB, ARGB, 4)
+TESTBPTOB(NV21, 2, 2, ARGB, ARGB, 4)
+TESTBPTOB(NV12, 2, 2, ABGR, ABGR, 4)
+TESTBPTOB(NV21, 2, 2, ABGR, ABGR, 4)
+TESTBPTOB(NV12, 2, 2, RGB24, RGB24, 3)
+TESTBPTOB(NV21, 2, 2, RGB24, RGB24, 3)
+TESTBPTOB(NV12, 2, 2, RAW, RAW, 3)
+TESTBPTOB(NV21, 2, 2, RAW, RAW, 3)
+TESTBPTOB(NV21, 2, 2, YUV24, RAW, 3)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2)
+#endif
+
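+// Generic packed-to-packed conversion test: fills the source with random
+// bytes and requires the optimized path to match the C path exactly.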
+#define TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \
+ EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, \
+ kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB*(int)sizeof(TYPE_B)); \
+ align_buffer_page_end(dst_argb_opt, \
+ kStrideB* kHeightB*(int)sizeof(TYPE_B)); \
+ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 1, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 101, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_B*)dst_argb_c, \
+ kStrideB, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, \
+ (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
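+// Random-size variant: runs the conversion over randomly sized images (1-64
+// wide, 1-32 tall) to exercise edge widths in the SIMD row functions.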
+#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, \
+ TYPE_B, EPP_B, STRIDE_B, HEIGHT_B) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \
+ for (int times = 0; times < benchmark_iterations_; ++times) { \
+ const int kWidth = (fastrand() & 63) + 1; \
+ const int kHeight = (fastrand() & 31) + 1; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+ align_buffer_page_end(dst_argb_c, \
+ kStrideB* kHeightB*(int)sizeof(TYPE_B)); \
+ align_buffer_page_end(dst_argb_opt, \
+ kStrideB* kHeightB*(int)sizeof(TYPE_B)); \
+ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
+ src_argb[i] = 0xfe; \
+ } \
+ memset(dst_argb_c, 123, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 123, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c, \
+ kStrideB, kWidth, kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt, \
+ kStrideB, kWidth, kHeight); \
+ for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ } \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTATOB(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \
+ EPP_B, STRIDE_B, HEIGHT_B) \
+ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \
+ STRIDE_B, HEIGHT_B, benchmark_width_ + 1, _Any, +, 0) \
+ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \
+ STRIDE_B, HEIGHT_B, benchmark_width_, _Unaligned, +, 4) \
+ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \
+ STRIDE_B, HEIGHT_B, benchmark_width_, _Invert, -, 0) \
+ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \
+ STRIDE_B, HEIGHT_B, benchmark_width_, _Opt, +, 0) \
+ TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \
+ EPP_B, STRIDE_B, HEIGHT_B)
+#else
+#define TESTATOB(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \
+ EPP_B, STRIDE_B, HEIGHT_B) \
+ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \
+ STRIDE_B, HEIGHT_B, benchmark_width_, _Opt, +, 0)
+#endif
+
+TESTATOB(AB30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+TESTATOB(AB30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(ABGR, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1)
+#endif
+TESTATOB(ABGR, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(AR30, uint8_t, 4, 4, 1, AB30, uint8_t, 4, 4, 1)
+#endif
+TESTATOB(AR30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(AR30, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1)
+TESTATOB(AR30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+#endif
+TESTATOB(ARGB, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(ARGB, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1)
+#endif
+TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB1555, uint8_t, 2, 2, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB4444, uint8_t, 2, 2, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1)
+TESTATOB(ABGR, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1)
+TESTATOB(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1)
+TESTATOB(ABGR, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1)
+TESTATOB(ABGR, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1)
+#endif
+TESTATOB(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1)
+TESTATOB(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(I400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(I400, uint8_t, 1, 1, 1, I400, uint8_t, 1, 1, 1)
+TESTATOB(I400, uint8_t, 1, 1, 1, I400Mirror, uint8_t, 1, 1, 1)
+TESTATOB(J400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(J400, uint8_t, 1, 1, 1, J400, uint8_t, 1, 1, 1)
+TESTATOB(RAW, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(RAW, uint8_t, 3, 3, 1, RGBA, uint8_t, 4, 4, 1)
+TESTATOB(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1)
+TESTATOB(RGB24, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1)
+TESTATOB(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1)
+TESTATOB(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOB(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1)
+#endif
+TESTATOB(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(UYVY, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(YUY2, uint8_t, 2, 4, 1, Y, uint8_t, 1, 1, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
+TESTATOB(ARGB, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1)
+TESTATOB(ABGR, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
+TESTATOB(ABGR, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1)
+TESTATOB(AR64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(AB64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOB(AR64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+TESTATOB(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+TESTATOB(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1)
+TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
+
+// In-place test: the conversion reads and writes the same buffer; the
+// optimized in-place result must match the C in-place result.
+#define TESTATOAI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \
+ EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, \
+ kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \
+ align_buffer_page_end(dst_argb_c, \
+ kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \
+ align_buffer_page_end(dst_argb_opt, \
+ kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \
+ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memcpy(dst_argb_c + OFF, src_argb, \
+ kStrideA * kHeightA * (int)sizeof(TYPE_A)); \
+ memcpy(dst_argb_opt + OFF, src_argb, \
+ kStrideA * kHeightA * (int)sizeof(TYPE_A)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B((TYPE_A*)(dst_argb_c /* src */ + OFF), kStrideA, \
+ (TYPE_B*)dst_argb_c, kStrideB, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B((TYPE_A*)(dst_argb_opt /* src */ + OFF), kStrideA, \
+ (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \
+ } \
+ memcpy(dst_argb_opt + OFF, src_argb, \
+ kStrideA * kHeightA * (int)sizeof(TYPE_A)); \
+ FMT_A##To##FMT_B((TYPE_A*)(dst_argb_opt /* src */ + OFF), kStrideA, \
+ (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \
+ for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTATOA(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \
+ EPP_B, STRIDE_B, HEIGHT_B) \
+ TESTATOAI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \
+ STRIDE_B, HEIGHT_B, benchmark_width_, _Inplace, +, 0)
+
+TESTATOA(AB30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+TESTATOA(AB30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOA(ABGR, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1)
+#endif
+TESTATOA(ABGR, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOA(AR30, uint8_t, 4, 4, 1, AB30, uint8_t, 4, 4, 1)
+#endif
+TESTATOA(AR30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOA(AR30, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1)
+TESTATOA(AR30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+#endif
+TESTATOA(ARGB, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOA(ARGB, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1)
+#endif
+TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB1555, uint8_t, 2, 2, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB4444, uint8_t, 2, 2, 1)
+// TODO(fbarchard): Support in place for mirror.
+// TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1)
+TESTATOA(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1)
+TESTATOA(ABGR, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1)
+TESTATOA(ABGR, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOA(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1)
+#endif
+TESTATOA(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 1)
+TESTATOA(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1)
+// TODO(fbarchard): Support in place for conversions that increase bpp.
+// TESTATOA(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1)
+// TESTATOA(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+// TESTATOA(I400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(I400, uint8_t, 1, 1, 1, I400, uint8_t, 1, 1, 1)
+// TESTATOA(I400, uint8_t, 1, 1, 1, I400Mirror, uint8_t, 1, 1, 1)
+// TESTATOA(J400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(J400, uint8_t, 1, 1, 1, J400, uint8_t, 1, 1, 1)
+// TESTATOA(RAW, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1)
+// TESTATOA(RAW, uint8_t, 3, 3, 1, RGBA, uint8_t, 4, 4, 1)
+TESTATOA(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1)
+// TESTATOA(RGB24, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1)
+// TESTATOA(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1)
+TESTATOA(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+// TESTATOA(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1)
+#endif
+TESTATOA(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+// TESTATOA(UYVY, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1)
+// TESTATOA(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(YUY2, uint8_t, 2, 4, 1, Y, uint8_t, 1, 1, 1)
+// TESTATOA(ARGB, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
+// TESTATOA(ARGB, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1)
+// TESTATOA(ABGR, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
+// TESTATOA(ABGR, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1)
+TESTATOA(AR64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(AB64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1)
+TESTATOA(AR64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+TESTATOA(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1)
+TESTATOA(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1)
+TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
+
+#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 1, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 101, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \
+ NULL, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_opt, \
+ kStrideB, NULL, kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \
+ STRIDE_B, HEIGHT_B) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \
+ for (int times = 0; times < benchmark_iterations_; ++times) { \
+ const int kWidth = (fastrand() & 63) + 1; \
+ const int kHeight = (fastrand() & 31) + 1; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = \
+ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = \
+ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_page_end(src_argb, kStrideA* kHeightA); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 123, kStrideB* kHeightB); \
+ memset(dst_argb_opt, 123, kStrideB* kHeightB); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \
+ kWidth, kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB, \
+ NULL, kWidth, kHeight); \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ } \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_ + 1, _Any, +, 0) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, _Unaligned, +, 2) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, _Invert, -, 0) \
+ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B, benchmark_width_, _Opt, +, 0) \
+ TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B)
+#else
+#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B) \
+ TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
+ HEIGHT_B)
+#endif
+
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
+#endif
+
+// These conversions, called twice, produce the original result,
+// e.g. an endian swap applied twice.
+#define TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, \
+ OFF) \
+ TEST_F(LibYUVConvertTest, FMT_ATOB##_Endswap##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kStrideA = \
+ (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ align_buffer_page_end(src_argb, \
+ kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+ align_buffer_page_end(dst_argb_opt, \
+ kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
+ src_argb[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c, 1, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+ memset(dst_argb_opt, 101, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_c, \
+ kStrideA, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_opt, \
+ kStrideA, kWidth, NEG kHeight); \
+ } \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_ATOB((TYPE_A*)dst_argb_c, kStrideA, (TYPE_A*)dst_argb_c, kStrideA, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ FMT_ATOB((TYPE_A*)dst_argb_opt, kStrideA, (TYPE_A*)dst_argb_opt, kStrideA, \
+ kWidth, NEG kHeight); \
+ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
+ EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_argb); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTEND(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A) \
+ TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ + 1, \
+ _Any, +, 0) \
+ TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \
+ _Unaligned, +, 2) \
+ TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \
+ _Opt, +, 0)
+#else
+#define TESTEND(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A) \
+ TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \
+ _Opt, +, 0)
+#endif
+
+TESTEND(ARGBToBGRA, uint8_t, 4, 4, 1)
+TESTEND(ARGBToABGR, uint8_t, 4, 4, 1)
+TESTEND(BGRAToARGB, uint8_t, 4, 4, 1)
+TESTEND(ABGRToARGB, uint8_t, 4, 4, 1)
+TESTEND(AB64ToAR64, uint16_t, 4, 4, 1)
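+
+// A minimal sketch (hypothetical test, not part of the upstream suite) of
+// the property the TESTEND cases rely on: a channel-order swap applied
+// twice restores the original pixels.
+TEST_F(LibYUVConvertTest, ARGBToBGRATwiceIsIdentity) {
+ uint8_t argb[4] = {1, 2, 3, 4};
+ uint8_t bgra[4] = {0, 0, 0, 0};
+ uint8_t back[4] = {0, 0, 0, 0};
+ ARGBToBGRA(argb, 4, bgra, 4, 1, 1);  // ARGB -> BGRA
+ BGRAToARGB(bgra, 4, back, 4, 1, 1);  // BGRA -> ARGB
+ for (int i = 0; i < 4; ++i) {
+ EXPECT_EQ(argb[i], back[i]);
+ }
+}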
+
+#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, N, NEG, OFF, ATTEN) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ src_a[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
+ dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \
+ ATTEN); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
+ dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \
+ ATTEN); \
+ } \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(src_a); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ + 1, _Any, +, 0, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Unaligned, +, 2, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Invert, -, 0, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, 0) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Premult, +, 0, 1)
+#else
+#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN) \
+ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, 0)
+#endif
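+
+// The trailing ATTEN argument is the conversions' attenuate flag; the
+// _Premult variant passes 1 so RGB is premultiplied by alpha.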
+
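+// The single-letter prefixes below bind the same I4xxAlphaTo* entry points
+// to a colorspace matrix: J = JPEG (full-range BT.601), F = full-range
+// BT.709, H = BT.709, U = BT.2020, V = full-range BT.2020.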
+#define J420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define F420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define F420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define H420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define F422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define F422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define H422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define F444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define F444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define H444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+
+#define I420AlphaToARGBFilter(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I420AlphaToARGBMatrixFilter(a, b, c, d, e, f, g, h, i, j, \
+ &kYuvI601Constants, k, l, m, kFilterBilinear)
+#define I422AlphaToARGBFilter(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I422AlphaToARGBMatrixFilter(a, b, c, d, e, f, g, h, i, j, \
+ &kYuvI601Constants, k, l, m, kFilterBilinear)
+
+#if defined(ENABLE_FULL_TESTS)
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(J420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(J420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(F420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(F420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(V420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(V420Alpha, 2, 2, ABGR, 4, 4, 1)
+TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(J422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(J422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(H422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(H422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(F422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(F422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(U422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(U422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(V422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(V422Alpha, 2, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(J444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(J444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(H444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(H444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(F444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(F444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(U444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(U444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(V444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1)
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGBFilter, 4, 4, 1)
+TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1)
+#else
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1)
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGBFilter, 4, 4, 1)
+TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1)
+#endif
+
+TEST_F(LibYUVConvertTest, TestYToARGB) {
+ uint8_t y[32];
+ uint8_t expectedg[32];
+ for (int i = 0; i < 32; ++i) {
+ y[i] = i * 5 + 17;
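+ // 1.164 ~= 255.0 / 219: expand limited-range Y (16..235) to full range.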
+ expectedg[i] = static_cast<int>((y[i] - 16) * 1.164f + 0.5f);
+ }
+ uint8_t argb[32 * 4];
+ YToARGB(y, 0, argb, 0, 32, 1);
+
+ for (int i = 0; i < 32; ++i) {
+ printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i],
+ argb[i * 4 + 0], argb[i * 4 + 1], argb[i * 4 + 2], argb[i * 4 + 3]);
+ }
+ for (int i = 0; i < 32; ++i) {
+ EXPECT_EQ(expectedg[i], argb[i * 4 + 0]);
+ }
+}
+
+static const uint8_t kNoDither4x4[16] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
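+// With an all-zero dither table, ARGBToRGB565Dither must produce
+// byte-identical output to plain ARGBToRGB565; TestNoDither verifies this.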
+TEST_F(LibYUVConvertTest, TestNoDither) {
+ align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_rgb565dither,
+ benchmark_width_ * benchmark_height_ * 2);
+ MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
+ MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+ MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
+ ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
+ benchmark_width_, benchmark_height_);
+ ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
+ benchmark_width_ * 2, kNoDither4x4, benchmark_width_,
+ benchmark_height_);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
+ EXPECT_EQ(dst_rgb565[i], dst_rgb565dither[i]);
+ }
+
+ free_aligned_buffer_page_end(src_argb);
+ free_aligned_buffer_page_end(dst_rgb565);
+ free_aligned_buffer_page_end(dst_rgb565dither);
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8_t kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
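+// Each table value (0..7) is added to the 8-bit channels (clamped to 255)
+// before truncation to 5/6/5 bits, so after the RGB565 round trip the
+// dithered and undithered results may differ by several counts per channel,
+// hence the EXPECT_NEAR tolerance in TestDither.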
+TEST_F(LibYUVConvertTest, TestDither) {
+ align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_rgb565dither,
+ benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_argb, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_argbdither,
+ benchmark_width_ * benchmark_height_ * 4);
+ MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
+ MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
+ MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
+ MemRandomize(dst_argb, benchmark_width_ * benchmark_height_ * 4);
+ MemRandomize(dst_argbdither, benchmark_width_ * benchmark_height_ * 4);
+ ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
+ benchmark_width_, benchmark_height_);
+ ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
+ benchmark_width_ * 2, kDither565_4x4, benchmark_width_,
+ benchmark_height_);
+ RGB565ToARGB(dst_rgb565, benchmark_width_ * 2, dst_argb, benchmark_width_ * 4,
+ benchmark_width_, benchmark_height_);
+ RGB565ToARGB(dst_rgb565dither, benchmark_width_ * 2, dst_argbdither,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
+ EXPECT_NEAR(dst_argb[i], dst_argbdither[i], 9);
+ }
+ free_aligned_buffer_page_end(src_argb);
+ free_aligned_buffer_page_end(dst_rgb565);
+ free_aligned_buffer_page_end(dst_rgb565dither);
+ free_aligned_buffer_page_end(dst_argb);
+ free_aligned_buffer_page_end(dst_argbdither);
+}
+
+#define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, N, NEG, OFF, FMT_C, BPP_C) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, \
+ kStrideB, NULL, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B##Dither( \
+ src_y + OFF, kWidth, src_u + OFF, kStrideUV, src_v + OFF, kStrideUV, \
+ dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight); \
+ } \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
+ align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \
+ memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \
+ memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \
+ FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
+ kWidth, kHeight); \
+ FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \
+ kWidth * BPP_C, kWidth, kHeight); \
+ for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb32_c[i], dst_argb32_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ free_aligned_buffer_page_end(dst_argb32_c); \
+ free_aligned_buffer_page_end(dst_argb32_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Unaligned, +, 2, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
+#else
+#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, FMT_C, BPP_C) \
+ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
+#endif
+
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
+#endif
+
+// Transitive test: A to B to C gives the same result as A to C.
+// Benchmarks A to B to C for comparison to the 1 step conversion,
+// benchmarked elsewhere.
+#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ W1280, N, NEG, OFF, FMT_C, BPP_C) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##To##FMT_C##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_b + OFF, kStrideB, \
+ kWidth, NEG kHeight); \
+ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
+ const int kStrideC = kWidth * BPP_C; \
+ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
+ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
+ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, \
+ kStrideC, kWidth, NEG kHeight); \
+ /* Convert B to C */ \
+ FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \
+ kStrideC, kWidth, kHeight); \
+ } \
+ for (int i = 0; i < kStrideC * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_bc); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Unaligned, +, 2, FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
+#else
+#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ FMT_C, BPP_C) \
+ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
+#endif
+
+#if defined(ENABLE_FULL_TESTS)
+TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB24, 3)
+TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3)
+TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RGB24, 3)
+TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3)
+TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3)
+TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(U420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(U420, 2, 2, ARGB, 1, 4, ARGB, 4)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
+TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
+#endif
+TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(J422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(U422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(U422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(V422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(V422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(H444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(U444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(U444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(V444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(V444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
+#else
+TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB24, 3)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
+TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
+TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
+#endif
+
+// Transitive test: compare 1 step vs 2 step conversion for YUVA to ARGB.
+// Benchmarks the 2 step conversion for comparison to the 1 step conversion.
+#define TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##To##FMT_C##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
+ const int kSizeUV = \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
+ const int kStrideC = kWidth * BPP_C; \
+ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
+ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
+ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ src_a[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ /* Convert A to B */ \
+ FMT_PLANAR##To##FMT_B( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
+ dst_argb_b + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \
+ /* Convert B to C */ \
+ FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \
+ kStrideC, kWidth, kHeight); \
+ } \
+ /* Convert A to C */ \
+ FMT_PLANAR##To##FMT_C( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
+ dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN); \
+ for (int i = 0; i < kStrideC * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(src_a); \
+ free_aligned_buffer_page_end(dst_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_bc); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ FMT_C, BPP_C) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Unaligned, +, 2, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Invert, -, 0, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1)
+#else
+#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ FMT_C, BPP_C) \
+ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0)
+#endif
+
+#if defined(ENABLE_FULL_TESTS)
+TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(J420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(J420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(H420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(H420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(F420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(F420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(U420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(U420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(V420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(V420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(I422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(I422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(J422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(J422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(F422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(F422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(H422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(H422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(U422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(U422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(V422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(V422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(I444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(J444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(J444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(H444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(H444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(U444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(U444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(V444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
+TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+#else
+TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(I422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
+#endif
+
+#define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \
+ OFF, FMT_C, BPP_C) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##To##FMT_C##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \
+ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
+ align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
+ MemRandomize(src_argb_a + OFF, kStrideA * kHeight); \
+ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
+ FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB, \
+ kWidth, NEG kHeight); \
+ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
+ const int kStrideC = kWidth * BPP_C; \
+ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
+ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
+ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_C(src_argb_a + OFF, kStrideA, dst_argb_c + OFF, kStrideC, \
+ kWidth, NEG kHeight); \
+ /* Convert B to C */ \
+ FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \
+ kStrideC, kWidth, kHeight); \
+ } \
+ for (int i = 0; i < kStrideC * kHeight; i += 4) { \
+ EXPECT_EQ(dst_argb_c[i + OFF + 0], dst_argb_bc[i + OFF + 0]); \
+ EXPECT_EQ(dst_argb_c[i + OFF + 1], dst_argb_bc[i + OFF + 1]); \
+ EXPECT_EQ(dst_argb_c[i + OFF + 2], dst_argb_bc[i + OFF + 2]); \
+ EXPECT_NEAR(dst_argb_c[i + OFF + 3], dst_argb_bc[i + OFF + 3], 64); \
+ } \
+ free_aligned_buffer_page_end(src_argb_a); \
+ free_aligned_buffer_page_end(dst_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_bc); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANETOE(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
+ _Unaligned, +, 4, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
+ _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
+ _Opt, +, 0, FMT_C, BPP_C)
+#else
+#define TESTPLANETOE(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
+ _Opt, +, 0, FMT_C, BPP_C)
+#endif
+
+// Caveat: Destination needs to be 4 bytes per pixel. AR30/AB30 keep only
+// 2 alpha bits, so TESTPLANETOEI compares the alpha channel with
+// EXPECT_NEAR rather than EXPECT_EQ.
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
+TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
+TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
+TESTPLANETOE(AR30, 1, 4, ABGR, 1, 4, ARGB, 4)
+TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4)
+TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4)
+TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4)
+TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4)
+#endif
+
+TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
+ // 2x2 frames
+ uint32_t src[4];
+ uint32_t dst[4];
+ // some random input
+ src[0] = 0x11000000;
+ src[1] = 0x00450000;
+ src[2] = 0x00009f00;
+ src[3] = 0x000000ff;
+ // zeros on destination
+ dst[0] = 0x00000000;
+ dst[1] = 0x00000000;
+ dst[2] = 0x00000000;
+ dst[3] = 0x00000000;
+
+ int r = ConvertToARGB(reinterpret_cast<uint8_t*>(src),
+ 16, // input size
+ reinterpret_cast<uint8_t*>(dst),
+ 8, // destination stride
+ 0, // crop_x
+ 0, // crop_y
+ 2, // width
+ 2, // height
+ 2, // crop width
+ 2, // crop height
+ kRotate90, FOURCC_ARGB);
+
+ EXPECT_EQ(r, 0);
+ // 90 degree clockwise rotation, no color conversion
+ EXPECT_EQ(dst[0], src[2]);
+ EXPECT_EQ(dst[1], src[0]);
+ EXPECT_EQ(dst[2], src[3]);
+ EXPECT_EQ(dst[3], src[1]);
+}
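+
+// For reference: rotating a 2x2 frame 90 degrees clockwise maps the
+// row-major indices as
+//   src: 0 1     dst: 2 0
+//        2 3          3 1
+// which is exactly what the EXPECTs above verify.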
+
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
+ // ARGBToAR30Row_AVX2 expects a multiple of 8 pixels.
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
+ align_buffer_page_end(src, kPixels * 4);
+ align_buffer_page_end(dst_opt, kPixels * 4);
+ align_buffer_page_end(dst_c, kPixels * 4);
+ MemRandomize(src, kPixels * 4);
+ memset(dst_opt, 0, kPixels * 4);
+ memset(dst_c, 1, kPixels * 4);
+
+ ARGBToAR30Row_C(src, dst_c, kPixels);
+
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ if (has_avx2) {
+ ARGBToAR30Row_AVX2(src, dst_opt, kPixels);
+ } else if (has_ssse3) {
+ ARGBToAR30Row_SSSE3(src, dst_opt, kPixels);
+ } else {
+ ARGBToAR30Row_C(src, dst_opt, kPixels);
+ }
+ }
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_opt[i], dst_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src);
+ free_aligned_buffer_page_end(dst_opt);
+ free_aligned_buffer_page_end(dst_c);
+}
+#endif // HAS_ARGBTOAR30ROW_AVX2
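+
+// A minimal sketch (hypothetical test, not part of the upstream suite) of
+// the AR30 layout the row functions above produce: per little-endian 32-bit
+// word, 10-bit B in bits 0..9, G in 10..19, R in 20..29 and 2-bit A in
+// 30..31; an 8-bit channel v widens to 10 bits as (v << 2) | (v >> 6).
+TEST_F(LibYUVConvertTest, ARGBToAR30RowPackingSketch) {
+ const uint8_t src[4] = {0x12, 0x34, 0x56, 0xff};  // B, G, R, A in memory
+ uint8_t dst[4];
+ ARGBToAR30Row_C(src, dst, 1);
+ uint32_t v;
+ memcpy(&v, dst, 4);
+ EXPECT_EQ(v & 0x3ffu, (0x12u << 2) | (0x12u >> 6));          // B
+ EXPECT_EQ((v >> 10) & 0x3ffu, (0x34u << 2) | (0x34u >> 6));  // G
+ EXPECT_EQ((v >> 20) & 0x3ffu, (0x56u << 2) | (0x56u >> 6));  // R
+ EXPECT_EQ(v >> 30, 0xffu >> 6);                              // A
+}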
+
+#ifdef HAS_ABGRTOAR30ROW_AVX2
+TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
+ // ABGRToAR30Row_AVX2 expects a multiple of 8 pixels.
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
+ align_buffer_page_end(src, kPixels * 4);
+ align_buffer_page_end(dst_opt, kPixels * 4);
+ align_buffer_page_end(dst_c, kPixels * 4);
+ MemRandomize(src, kPixels * 4);
+ memset(dst_opt, 0, kPixels * 4);
+ memset(dst_c, 1, kPixels * 4);
+
+ ABGRToAR30Row_C(src, dst_c, kPixels);
+
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ if (has_avx2) {
+ ABGRToAR30Row_AVX2(src, dst_opt, kPixels);
+ } else if (has_ssse3) {
+ ABGRToAR30Row_SSSE3(src, dst_opt, kPixels);
+ } else {
+ ABGRToAR30Row_C(src, dst_opt, kPixels);
+ }
+ }
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_opt[i], dst_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src);
+ free_aligned_buffer_page_end(dst_opt);
+ free_aligned_buffer_page_end(dst_c);
+}
+#endif // HAS_ABGRTOAR30ROW_AVX2
+
+#if !defined(LEAN_TESTS)
+
+// Provide matrix wrappers for 12 bit 4:2:0 and 10 bit 4:4:4 YUV
+#define I012ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I012ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define I012ToAR30(a, b, c, d, e, f, g, h, i, j) \
+ I012ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define I012ToAB30(a, b, c, d, e, f, g, h, i, j) \
+ I012ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+
+#define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define I410ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define H410ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j)
+#define H410ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j)
+#define U410ToARGB(a, b, c, d, e, f, g, h, i, j) \
+ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j)
+#define U410ToABGR(a, b, c, d, e, f, g, h, i, j) \
+ I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j)
+#define I410ToAR30(a, b, c, d, e, f, g, h, i, j) \
+ I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define I410ToAB30(a, b, c, d, e, f, g, h, i, j) \
+ I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
+#define H410ToAR30(a, b, c, d, e, f, g, h, i, j) \
+ I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j)
+#define H410ToAB30(a, b, c, d, e, f, g, h, i, j) \
+ I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j)
+#define U410ToAR30(a, b, c, d, e, f, g, h, i, j) \
+ I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j)
+#define U410ToAB30(a, b, c, d, e, f, g, h, i, j) \
+ I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j)
+
+#define I010ToARGBFilter(a, b, c, d, e, f, g, h, i, j) \
+ I010ToARGBMatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+#define I010ToAR30Filter(a, b, c, d, e, f, g, h, i, j) \
+ I010ToAR30MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+#define I210ToARGBFilter(a, b, c, d, e, f, g, h, i, j) \
+ I210ToARGBMatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+#define I210ToAR30Filter(a, b, c, d, e, f, g, h, i, j) \
+ I210ToAR30MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \
+ kFilterBilinear)
+
+// TODO(fbarchard): Fix clamping issue affecting the U channel.
+#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \
+ BPP_B, ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ const int kBpc = 2; \
+ align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
+ align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
+ align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = (fastrand() & FMT_MASK); \
+ reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = (fastrand() & FMT_MASK); \
+ } \
+ memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B( \
+ reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
+ dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B( \
+ reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
+ dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \
+ BPP_B, ALIGN, YALIGN) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_ + 1, _Any, +, 0, 0) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_, _Unaligned, +, 4, 4) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_, _Invert, -, 0, 0) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_, _Opt, +, 0, 0)
+#else
+#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \
+ BPP_B, ALIGN, YALIGN) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
+ ALIGN, YALIGN, benchmark_width_, _Opt, +, 0, 0)
+#endif
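+
+// FMT_MASK limits the random 16-bit samples to the source bit depth:
+// 0x3ff for the 10 bit formats (I010/I210/I410), 0xfff for 12 bit (I012).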
+
+// These conversions are only optimized for x86
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ABGR, 4, 4, 1)
+TESTPLANAR16TOB(I012, 2, 2, 0xfff, ARGB, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ARGBFilter, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ARGBFilter, 4, 4, 1)
+
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I012, 2, 2, 0xfff, AR30, 4, 4, 1)
+TESTPLANAR16TOB(I012, 2, 2, 0xfff, AB30, 4, 4, 1)
+TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30Filter, 4, 4, 1)
+TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30Filter, 4, 4, 1)
+#endif // LITTLE_ENDIAN_ONLY_TEST
+#endif // DISABLE_SLOW_TESTS
+
+#define TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ ALIGN, YALIGN, W1280, N, NEG, OFF, ATTEN, S_DEPTH) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ const int kBpc = 2; \
+ align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \
+ align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \
+ align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \
+ align_buffer_page_end(src_a, kWidth* kHeight* kBpc + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ reinterpret_cast<uint16_t*>(src_y + OFF)[i] = \
+ (fastrand() & ((1 << S_DEPTH) - 1)); \
+ reinterpret_cast<uint16_t*>(src_a + OFF)[i] = \
+ (fastrand() & ((1 << S_DEPTH) - 1)); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ reinterpret_cast<uint16_t*>(src_u + OFF)[i] = \
+ (fastrand() & ((1 << S_DEPTH) - 1)); \
+ reinterpret_cast<uint16_t*>(src_v + OFF)[i] = \
+ (fastrand() & ((1 << S_DEPTH) - 1)); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16_t*>(src_y + OFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_u + OFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_v + OFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_a + OFF), kWidth, \
+ dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \
+ ATTEN); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B( \
+ reinterpret_cast<uint16_t*>(src_y + OFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_u + OFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_v + OFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_a + OFF), kWidth, \
+ dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \
+ } \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(src_a); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTQPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ ALIGN, YALIGN, S_DEPTH) \
+ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \
+ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Unaligned, +, 2, 0, S_DEPTH) \
+ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \
+ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) \
+ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Premult, +, 0, 1, S_DEPTH)
+#else
+#define TESTQPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ ALIGN, YALIGN, S_DEPTH) \
+ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH)
+#endif
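+
+// S_DEPTH is the source bit depth; Y/U/V/A samples are masked with
+// (1 << S_DEPTH) - 1 so they stay within the format's valid range.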
+
+#define I010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \
+ l, m)
+#define I010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \
+ l, m)
+#define J010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define F010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define F010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define H010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define V010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define I210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \
+ l, m)
+#define I210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \
+ l, m)
+#define J210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define F210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define F210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define H210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define V210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define I410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \
+ l, m)
+#define I410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \
+ l, m)
+#define J410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define J410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
+ l, m)
+#define F410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define F410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
+ l, m)
+#define H410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define H410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
+ l, m)
+#define U410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define U410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
+ l, m)
+#define V410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define V410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
+ l, m)
+#define I010AlphaToARGBFilter(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I010AlphaToARGBMatrixFilter(a, b, c, d, e, f, g, h, i, j, \
+ &kYuvI601Constants, k, l, m, kFilterBilinear)
+#define I210AlphaToARGBFilter(a, b, c, d, e, f, g, h, i, j, k, l, m) \
+ I210AlphaToARGBMatrixFilter(a, b, c, d, e, f, g, h, i, j, \
+ &kYuvI601Constants, k, l, m, kFilterBilinear)
+
+// These conversions are only optimized for x86
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+TESTQPLANAR16TOB(I010Alpha, 2, 2, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I010Alpha, 2, 2, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(J010Alpha, 2, 2, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(J010Alpha, 2, 2, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(H010Alpha, 2, 2, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(H010Alpha, 2, 2, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(F010Alpha, 2, 2, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(F010Alpha, 2, 2, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(U010Alpha, 2, 2, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(U010Alpha, 2, 2, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(V010Alpha, 2, 2, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(V010Alpha, 2, 2, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I210Alpha, 2, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(J210Alpha, 2, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(J210Alpha, 2, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(H210Alpha, 2, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(H210Alpha, 2, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(F210Alpha, 2, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(F210Alpha, 2, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(U210Alpha, 2, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(U210Alpha, 2, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(V210Alpha, 2, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(V210Alpha, 2, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I410Alpha, 1, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I410Alpha, 1, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(J410Alpha, 1, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(J410Alpha, 1, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(H410Alpha, 1, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(H410Alpha, 1, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(F410Alpha, 1, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(F410Alpha, 1, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(U410Alpha, 1, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(U410Alpha, 1, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(V410Alpha, 1, 1, ARGB, 4, 4, 1, 10)
+TESTQPLANAR16TOB(V410Alpha, 1, 1, ABGR, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I010Alpha, 2, 2, ARGBFilter, 4, 4, 1, 10)
+TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10)
+#endif // DISABLE_SLOW_TESTS
+
+#define TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2; \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; \
+ const int kBpc = 2; \
+ align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
+ align_buffer_page_end(src_uv, kSizeUV* kBpc + SOFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
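+ /* P0xx/P2xx samples are MSB-aligned within 16 bits, so the fill masks */ \
+ /* below keep the top S_DEPTH bits (the I0xx tests mask the low bits). */ \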
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = \
+ (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ reinterpret_cast<uint16_t*>(src_uv + SOFF)[i] = \
+ (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \
+ } \
+ memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_uv + SOFF), \
+ kStrideUV, dst_argb_c + DOFF, kStrideB, kWidth, \
+ NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_uv + SOFF), \
+ kStrideUV, dst_argb_opt + DOFF, kStrideB, kWidth, \
+ NEG kHeight); \
+ } \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTBP16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, S_DEPTH) \
+ TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \
+ benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \
+ TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \
+ benchmark_width_, _Unaligned, +, 4, 4, S_DEPTH) \
+ TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \
+ benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \
+ TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \
+ benchmark_width_, _Opt, +, 0, 0, S_DEPTH)
+#else
+#define TESTBP16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, S_DEPTH) \
+ TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \
+ benchmark_width_, _Opt, +, 0, 0, S_DEPTH)
+#endif
+
+#define P010ToARGB(a, b, c, d, e, f, g, h) \
+ P010ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P210ToARGB(a, b, c, d, e, f, g, h) \
+ P210ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P010ToAR30(a, b, c, d, e, f, g, h) \
+ P010ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P210ToAR30(a, b, c, d, e, f, g, h) \
+ P210ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+
+#define P012ToARGB(a, b, c, d, e, f, g, h) \
+ P012ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P212ToARGB(a, b, c, d, e, f, g, h) \
+ P212ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P012ToAR30(a, b, c, d, e, f, g, h) \
+ P012ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P212ToAR30(a, b, c, d, e, f, g, h) \
+ P212ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+
+#define P016ToARGB(a, b, c, d, e, f, g, h) \
+ P016ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P216ToARGB(a, b, c, d, e, f, g, h) \
+ P216ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P016ToAR30(a, b, c, d, e, f, g, h) \
+ P016ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+#define P216ToAR30(a, b, c, d, e, f, g, h) \
+ P216ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
+
+#define P010ToARGBFilter(a, b, c, d, e, f, g, h) \
+ P010ToARGBMatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \
+ kFilterBilinear)
+#define P210ToARGBFilter(a, b, c, d, e, f, g, h) \
+ P210ToARGBMatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \
+ kFilterBilinear)
+#define P010ToAR30Filter(a, b, c, d, e, f, g, h) \
+ P010ToAR30MatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \
+ kFilterBilinear)
+#define P210ToAR30Filter(a, b, c, d, e, f, g, h) \
+ P210ToAR30MatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \
+ kFilterBilinear)
+
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+TESTBP16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10)
+TESTBP16TOB(P210, 2, 1, ARGB, 4, 4, 1, 10)
+TESTBP16TOB(P012, 2, 2, ARGB, 4, 4, 1, 12)
+TESTBP16TOB(P212, 2, 1, ARGB, 4, 4, 1, 12)
+TESTBP16TOB(P016, 2, 2, ARGB, 4, 4, 1, 16)
+TESTBP16TOB(P216, 2, 1, ARGB, 4, 4, 1, 16)
+TESTBP16TOB(P010, 2, 2, ARGBFilter, 4, 4, 1, 10)
+TESTBP16TOB(P210, 2, 1, ARGBFilter, 4, 4, 1, 10)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTBP16TOB(P010, 2, 2, AR30, 4, 4, 1, 10)
+TESTBP16TOB(P210, 2, 1, AR30, 4, 4, 1, 10)
+TESTBP16TOB(P012, 2, 2, AR30, 4, 4, 1, 12)
+TESTBP16TOB(P212, 2, 1, AR30, 4, 4, 1, 12)
+TESTBP16TOB(P016, 2, 2, AR30, 4, 4, 1, 16)
+TESTBP16TOB(P216, 2, 1, AR30, 4, 4, 1, 16)
+TESTBP16TOB(P010, 2, 2, AR30Filter, 4, 4, 1, 10)
+TESTBP16TOB(P210, 2, 1, AR30Filter, 4, 4, 1, 10)
+#endif // LITTLE_ENDIAN_ONLY_TEST
+#endif // DISABLE_SLOW_TESTS
+
+static int Clamp(int y) {
+ if (y < 0) {
+ y = 0;
+ }
+ if (y > 255) {
+ y = 255;
+ }
+ return y;
+}
+
+static int Clamp10(int y) {
+ if (y < 0) {
+ y = 0;
+ }
+ if (y > 1023) {
+ y = 1023;
+ }
+ return y;
+}
+
+// Test 8 bit YUV to 8 bit RGB
+TEST_F(LibYUVConvertTest, TestH420ToARGB) {
+ const int kSize = 256;
+ int histogram_b[256];
+ int histogram_g[256];
+ int histogram_r[256];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+ align_buffer_page_end(orig_yuv, kSize + kSize / 2 * 2);
+ align_buffer_page_end(argb_pixels, kSize * 4);
+ uint8_t* orig_y = orig_yuv;
+ uint8_t* orig_u = orig_y + kSize;
+ uint8_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+ orig_u[i] = 128;  // 128 is the neutral (zero) chroma value.
+ orig_v[i] = 128;
+ }
+
+ H420ToARGB(orig_y, 0, orig_u, 0, orig_v, 0, argb_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int b = argb_pixels[i * 4 + 0];
+ int g = argb_pixels[i * 4 + 1];
+ int r = argb_pixels[i * 4 + 2];
+ int a = argb_pixels[i * 4 + 3];
+ ++histogram_b[b];
+ ++histogram_g[g];
+ ++histogram_r[r];
+ // Reference formula for Y channel contribution in YUV to RGB conversions:
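+ // 1.164f approximates 255.0 / 219.0, expanding limited-range Y (16..235)
+ // to full range (0..255); the 0.5f rounds to nearest.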
+ int expected_y = Clamp(static_cast<int>((i - 16) * 1.164f + 0.5f));
+ EXPECT_EQ(b, expected_y);
+ EXPECT_EQ(g, expected_y);
+ EXPECT_EQ(r, expected_y);
+ EXPECT_EQ(a, 255);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(argb_pixels);
+}
+
+// Test 10 bit YUV to 8 bit RGB
+TEST_F(LibYUVConvertTest, TestH010ToARGB) {
+ const int kSize = 1024;
+ int histogram_b[1024];
+ int histogram_g[1024];
+ int histogram_r[1024];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+ align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
+ align_buffer_page_end(argb_pixels, kSize * 4);
+ uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
+ uint16_t* orig_u = orig_y + kSize;
+ uint16_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+ orig_u[i] = 512;  // 512 is the 10-bit neutral (zero) chroma value.
+ orig_v[i] = 512;
+ }
+
+ H010ToARGB(orig_y, 0, orig_u, 0, orig_v, 0, argb_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int b = argb_pixels[i * 4 + 0];
+ int g = argb_pixels[i * 4 + 1];
+ int r = argb_pixels[i * 4 + 2];
+ int a = argb_pixels[i * 4 + 3];
+ ++histogram_b[b];
+ ++histogram_g[g];
+ ++histogram_r[r];
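+ // 64 is the 10-bit limited-range offset (16 << 2); the division by 4
+ // scales the 10-bit value down to the 8-bit output range.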
+ int expected_y = Clamp(static_cast<int>((i - 64) * 1.164f / 4));
+ EXPECT_NEAR(b, expected_y, 1);
+ EXPECT_NEAR(g, expected_y, 1);
+ EXPECT_NEAR(r, expected_y, 1);
+ EXPECT_EQ(a, 255);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(argb_pixels);
+}
+
+// Test 10 bit YUV to 10 bit RGB
+// Caveat: results are compared with EXPECT_NEAR because the expected
+// value is computed with float rounding.
+TEST_F(LibYUVConvertTest, TestH010ToAR30) {
+ const int kSize = 1024;
+ int histogram_b[1024];
+ int histogram_g[1024];
+ int histogram_r[1024];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+
+ align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
+ align_buffer_page_end(ar30_pixels, kSize * 4);
+ uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
+ uint16_t* orig_u = orig_y + kSize;
+ uint16_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+ orig_u[i] = 512;  // 512 is the 10-bit neutral (zero) chroma value.
+ orig_v[i] = 512;
+ }
+
+ H010ToAR30(orig_y, 0, orig_u, 0, orig_v, 0, ar30_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int b10 = reinterpret_cast<uint32_t*>(ar30_pixels)[i] & 1023;
+ int g10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 10) & 1023;
+ int r10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 20) & 1023;
+ int a2 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 30) & 3;
+ ++histogram_b[b10];
+ ++histogram_g[g10];
+ ++histogram_r[r10];
+ int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f + 0.5));
+ EXPECT_NEAR(b10, expected_y, 4);
+ EXPECT_NEAR(g10, expected_y, 4);
+ EXPECT_NEAR(r10, expected_y, 4);
+ EXPECT_EQ(a2, 3);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(ar30_pixels);
+}
+
+// Test 10 bit YUV to 10 bit RGB
+// Caveat: results are compared with EXPECT_NEAR because the expected
+// value is computed with float rounding.
+TEST_F(LibYUVConvertTest, TestH010ToAB30) {
+ const int kSize = 1024;
+ int histogram_b[1024];
+ int histogram_g[1024];
+ int histogram_r[1024];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+
+ align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
+ align_buffer_page_end(ab30_pixels, kSize * 4);
+ uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
+ uint16_t* orig_u = orig_y + kSize;
+ uint16_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+ orig_u[i] = 512;  // 512 is the 10-bit neutral (zero) chroma value.
+ orig_v[i] = 512;
+ }
+
+ H010ToAB30(orig_y, 0, orig_u, 0, orig_v, 0, ab30_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int r10 = reinterpret_cast<uint32_t*>(ab30_pixels)[i] & 1023;
+ int g10 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 10) & 1023;
+ int b10 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 20) & 1023;
+ int a2 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 30) & 3;
+ ++histogram_b[b10];
+ ++histogram_g[g10];
+ ++histogram_r[r10];
+ int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f));
+ EXPECT_NEAR(b10, expected_y, 4);
+ EXPECT_NEAR(g10, expected_y, 4);
+ EXPECT_NEAR(r10, expected_y, 4);
+ EXPECT_EQ(a2, 3);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(ab30_pixels);
+}
+
+// Test 8 bit YUV to 10 bit RGB
+TEST_F(LibYUVConvertTest, TestH420ToAR30) {
+ const int kSize = 256;
+ const int kHistSize = 1024;
+ int histogram_b[kHistSize];
+ int histogram_g[kHistSize];
+ int histogram_r[kHistSize];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+ align_buffer_page_end(orig_yuv, kSize + kSize / 2 * 2);
+ align_buffer_page_end(ar30_pixels, kSize * 4);
+ uint8_t* orig_y = orig_yuv;
+ uint8_t* orig_u = orig_y + kSize;
+ uint8_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+ orig_u[i] = 128; // 128 is 0.
+ orig_v[i] = 128;
+ }
+
+ H420ToAR30(orig_y, 0, orig_u, 0, orig_v, 0, ar30_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int b10 = reinterpret_cast<uint32_t*>(ar30_pixels)[i] & 1023;
+ int g10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 10) & 1023;
+ int r10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 20) & 1023;
+ int a2 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 30) & 3;
+ ++histogram_b[b10];
+ ++histogram_g[g10];
+ ++histogram_r[r10];
+ int expected_y = Clamp10(static_cast<int>((i - 16) * 1.164f * 4.f));
+ EXPECT_NEAR(b10, expected_y, 4);
+ EXPECT_NEAR(g10, expected_y, 4);
+ EXPECT_NEAR(r10, expected_y, 4);
+ EXPECT_EQ(a2, 3);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kHistSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(ar30_pixels);
+}
+
+// Test that I400 with the JPEG matrix produces the same output as J400
+TEST_F(LibYUVConvertTest, TestI400) {
+ const int kSize = 256;
+ align_buffer_page_end(orig_i400, kSize);
+ align_buffer_page_end(argb_pixels_i400, kSize * 4);
+ align_buffer_page_end(argb_pixels_j400, kSize * 4);
+ align_buffer_page_end(argb_pixels_jpeg_i400, kSize * 4);
+ align_buffer_page_end(argb_pixels_h709_i400, kSize * 4);
+ align_buffer_page_end(argb_pixels_2020_i400, kSize * 4);
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_i400[i] = i;
+ }
+
+ J400ToARGB(orig_i400, 0, argb_pixels_j400, 0, kSize, 1);
+ I400ToARGB(orig_i400, 0, argb_pixels_i400, 0, kSize, 1);
+ I400ToARGBMatrix(orig_i400, 0, argb_pixels_jpeg_i400, 0, &kYuvJPEGConstants,
+ kSize, 1);
+ I400ToARGBMatrix(orig_i400, 0, argb_pixels_h709_i400, 0, &kYuvH709Constants,
+ kSize, 1);
+ I400ToARGBMatrix(orig_i400, 0, argb_pixels_2020_i400, 0, &kYuv2020Constants,
+ kSize, 1);
+
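+ // I400 is limited range (Y = 16 maps to 0) while J400 and the JPEG matrix
+ // are full range (Y passes through unchanged).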
+ EXPECT_EQ(0, argb_pixels_i400[0]);
+ EXPECT_EQ(0, argb_pixels_j400[0]);
+ EXPECT_EQ(0, argb_pixels_jpeg_i400[0]);
+ EXPECT_EQ(0, argb_pixels_h709_i400[0]);
+ EXPECT_EQ(0, argb_pixels_2020_i400[0]);
+ EXPECT_EQ(0, argb_pixels_i400[16 * 4]);
+ EXPECT_EQ(16, argb_pixels_j400[16 * 4]);
+ EXPECT_EQ(16, argb_pixels_jpeg_i400[16 * 4]);
+ EXPECT_EQ(0, argb_pixels_h709_i400[16 * 4]);
+ EXPECT_EQ(0, argb_pixels_2020_i400[16 * 4]);
+ EXPECT_EQ(130, argb_pixels_i400[128 * 4]);
+ EXPECT_EQ(128, argb_pixels_j400[128 * 4]);
+ EXPECT_EQ(128, argb_pixels_jpeg_i400[128 * 4]);
+ EXPECT_EQ(130, argb_pixels_h709_i400[128 * 4]);
+ EXPECT_EQ(130, argb_pixels_2020_i400[128 * 4]);
+ EXPECT_EQ(255, argb_pixels_i400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_j400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_jpeg_i400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_h709_i400[255 * 4]);
+ EXPECT_EQ(255, argb_pixels_2020_i400[255 * 4]);
+
+ for (int i = 0; i < kSize * 4; ++i) {
+ if ((i & 3) == 3) {
+ EXPECT_EQ(255, argb_pixels_j400[i]);
+ } else {
+ EXPECT_EQ(i / 4, argb_pixels_j400[i]);
+ }
+ EXPECT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_i400);
+ free_aligned_buffer_page_end(argb_pixels_i400);
+ free_aligned_buffer_page_end(argb_pixels_j400);
+ free_aligned_buffer_page_end(argb_pixels_jpeg_i400);
+ free_aligned_buffer_page_end(argb_pixels_h709_i400);
+ free_aligned_buffer_page_end(argb_pixels_2020_i400);
+}
+
+// Test RGB24 to ARGB and back to RGB24
+TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
+ const int kSize = 256;
+ align_buffer_page_end(orig_rgb24, kSize * 3);
+ align_buffer_page_end(argb_pixels, kSize * 4);
+ align_buffer_page_end(dest_rgb24, kSize * 3);
+
+ // Test grey scale
+ for (int i = 0; i < kSize * 3; ++i) {
+ orig_rgb24[i] = i;
+ }
+
+ RGB24ToARGB(orig_rgb24, 0, argb_pixels, 0, kSize, 1);
+ ARGBToRGB24(argb_pixels, 0, dest_rgb24, 0, kSize, 1);
+
+ for (int i = 0; i < kSize * 3; ++i) {
+ EXPECT_EQ(orig_rgb24[i], dest_rgb24[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_rgb24);
+ free_aligned_buffer_page_end(argb_pixels);
+ free_aligned_buffer_page_end(dest_rgb24);
+}
+
+TEST_F(LibYUVConvertTest, Test565) {
+ SIMD_ALIGNED(uint8_t orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8_t pixels565[256][2]);
+
+ for (int i = 0; i < 256; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ orig_pixels[i][j] = i;
+ }
+ }
+ ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
+ uint32_t checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
+ EXPECT_EQ(610919429u, checksum);
+}
+#endif // !defined(LEAN_TESTS)
+
+} // namespace libyuv
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
new file mode 100644
index 00000000..f55bace3
--- /dev/null
+++ b/unit_test/convert_test.cc
@@ -0,0 +1,2110 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "../unit_test/unit_test.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/video_common.h"
+
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */
+#endif
+
+#if defined(__riscv) && !defined(__clang__)
+#define DISABLE_SLOW_TESTS
+#undef ENABLE_FULL_TESTS
+#undef ENABLE_ROW_TESTS
+#define LEAN_TESTS
+#endif
+
+// Some functions fail on big endian. Enable these tests on all CPUs except
+// PowerPC; they exercise unoptimized code, so they are disabled by default.
+#if !defined(DISABLE_SLOW_TESTS) && !defined(__powerpc__)
+#define LITTLE_ENDIAN_ONLY_TEST 1
+#endif
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that are unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
+namespace libyuv {
+
+// Alias to copy pixels as is
+#define AR30ToAR30 ARGBCopy
+#define ABGRToABGR ARGBCopy
+
+// Subsample a dimension with a rounding-up divide.
+#define SUBSAMPLE(v, a) (((v) + (a) - 1) / (a))
+
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
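+// Both round up: e.g. SUBSAMPLE(5, 2) == 3 and ALIGNINT(5, 4) == 8.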
+
+// Planar test
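+// Each test runs the conversion once with SIMD disabled (the C reference),
+// then benchmark_iterations_ times with the benchmarked CPU flags, and
+// expects the two outputs to match byte for byte.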
+
+#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
+ SRC_DEPTH) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
+ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
+ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
+ "SRC_SUBSAMP_X unsupported"); \
+ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
+ "SRC_SUBSAMP_Y unsupported"); \
+ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
+ "DST_SUBSAMP_X unsupported"); \
+ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
+ "DST_SUBSAMP_Y unsupported"); \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
+ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
+ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(src_u, \
+ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(src_v, \
+ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
+ MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
+ MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
+ SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
+ SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \
+ SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
+ } \
+ for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \
+ src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \
+ src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
+ memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
+ memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
+ reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
+ NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
+ reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
+ NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
+ EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
+ } \
+ for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
+ EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
+ EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 2, SRC_DEPTH) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0, SRC_DEPTH)
+#else
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0, SRC_DEPTH)
+#endif
+
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1, 8)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I444, uint8_t, 1, 1, 1, 8)
+TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1, 8)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2, 8)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I012, uint16_t, 2, 2, 2, 8)
+TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10)
+TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8)
+TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10)
+TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10)
+TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12)
+TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I412, uint16_t, 2, 1, 1, 12)
+TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I012, uint16_t, 2, 2, 2, 12)
+TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I012, uint16_t, 2, 2, 2, 12)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10)
+TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I420, uint8_t, 1, 2, 2, 10)
+TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 10)
+TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I420, uint8_t, 1, 2, 2, 10)
+TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 10)
+TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 12)
+TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I420, uint8_t, 1, 2, 2, 12)
+TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 12)
+TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I420, uint8_t, 1, 2, 2, 12)
+TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12)
+
+// Test Android 420 to I420
+#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, N, NEG, OFF, PN, OFF_U, OFF_V) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##To##PN##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kSizeUV = \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, \
+ kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ uint8_t* src_u = src_uv + OFF_U; \
+ uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
+ int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
+ (fastrand() & 0xff); \
+ src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_u_c, 2, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 3, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_u_opt, 102, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 103, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \
+ kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \
+ dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \
+ SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \
+ SUBSAMP_Y) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ + 1, \
+ _Any, +, 0, PN, OFF_U, OFF_V) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \
+ _Unaligned, +, 2, PN, OFF_U, OFF_V) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \
+ -, 0, PN, OFF_U, OFF_V) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \
+ 0, PN, OFF_U, OFF_V)
+#else
+#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \
+ SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \
+ SUBSAMP_Y) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \
+ 0, PN, OFF_U, OFF_V)
+#endif
+
+TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2)
+TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
+TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
+#undef TESTAPLANARTOP
+#undef TESTAPLANARTOPI
+
+// Wrapper so the macro-generated tests can call I400ToNV21 with the same
+// signature as the other planar conversions; the U and V inputs are unused.
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* /* src_u */,
+ int /* src_stride_u */,
+ const uint8_t* /* src_v */,
+ int /* src_stride_v */,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I400ToNV21(src_y, src_stride_y, dst_y, dst_stride_y, dst_vu,
+ dst_stride_vu, width, height);
+}
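+// Used by the TESTPLANARTOBP(I400, ..., NV21, ...) instantiation below.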
+
+#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
+ SRC_DEPTH) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
+ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
+ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
+ "SRC_SUBSAMP_X unsupported"); \
+ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
+ "SRC_SUBSAMP_Y unsupported"); \
+ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
+ "DST_SUBSAMP_X unsupported"); \
+ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
+ "DST_SUBSAMP_Y unsupported"); \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
+ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
+ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(src_u, \
+ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(src_v, \
+ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_uv_c, \
+ kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_uv_opt, \
+ kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \
+ MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
+ MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
+ MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
+ SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
+ SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \
+ SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
+ } \
+ for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \
+ src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \
+ src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
+ memset(dst_uv_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \
+ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
+ memset(dst_uv_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y_p, kWidth, src_u_p, kSrcHalfWidth, \
+ src_v_p, kSrcHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
+ reinterpret_cast<DST_T*>(dst_uv_c), \
+ kDstHalfWidth * 2, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
+ reinterpret_cast<DST_T*>(dst_uv_opt), kDstHalfWidth * 2, kWidth, \
+ NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
+ EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
+ } \
+ for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC * 2; ++i) { \
+ EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \
+ SRC_DEPTH) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH)
+#else
+#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH)
+#endif
+
+TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOBP(I422, uint8_t, 1, 2, 1, NV21, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV12, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV21, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOBP(I400, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8)
+TESTPLANARTOBP(I010, uint16_t, 2, 2, 2, P010, uint16_t, 2, 2, 2, 10)
+TESTPLANARTOBP(I210, uint16_t, 2, 2, 1, P210, uint16_t, 2, 2, 1, 10)
+TESTPLANARTOBP(I012, uint16_t, 2, 2, 2, P012, uint16_t, 2, 2, 2, 12)
+TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
+
+#define TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, W1280, N, NEG, OFF, DOY, SRC_DEPTH, \
+ TILE_WIDTH, TILE_HEIGHT) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
+ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
+ "SRC_SUBSAMP_X unsupported"); \
+ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
+ "SRC_SUBSAMP_Y unsupported"); \
+ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
+ "DST_SUBSAMP_X unsupported"); \
+ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
+ "DST_SUBSAMP_Y unsupported"); \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
+ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
+ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
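+ /* Tiled sources (e.g. MM21 uses 16x32 tiles) are padded up to whole */ \
+ /* tiles; TILE_WIDTH and TILE_HEIGHT of 1 leave dimensions unchanged. */ \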
+ const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \
+ const int kPaddedHeight = \
+ (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \
+ const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \
+ const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
+ align_buffer_page_end( \
+ src_uv, \
+ 2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_uv_c, \
+ 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_uv_opt, \
+ 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
+ SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
+ SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
+ for (int i = 0; \
+ i < kPaddedWidth * kPaddedHeight * SRC_BPC / (int)sizeof(SRC_T); \
+ ++i) { \
+ src_y_p[i] = \
+ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
+ } \
+ for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2 * \
+ SRC_BPC / (int)sizeof(SRC_T); \
+ ++i) { \
+ src_uv_p[i] = \
+ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
+ memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
+ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
+ memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \
+ 2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T), \
+ DOY ? reinterpret_cast<DST_T*>(dst_y_c) : NULL, kWidth, \
+ reinterpret_cast<DST_T*>(dst_uv_c), 2 * kDstHalfWidth, kWidth, \
+ NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \
+ 2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T), \
+ DOY ? reinterpret_cast<DST_T*>(dst_y_opt) : NULL, kWidth, \
+ reinterpret_cast<DST_T*>(dst_uv_opt), 2 * kDstHalfWidth, kWidth, \
+ NEG kHeight); \
+ } \
+ if (DOY) { \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ } \
+ } \
+ } \
+ for (int i = 0; i < kDstHalfHeight; ++i) { \
+ for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \
+ EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \
+ dst_uv_opt[i * 2 * kDstHalfWidth + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTBPTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
+ TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_ + 1, _Any, +, 0, 1, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 2, 1, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0, 1, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT) \
+ TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _NullY, +, 0, 0, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT)
+#else
+#define TESTBPTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
+ DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
+ TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _NullY, +, 0, 0, SRC_DEPTH, TILE_WIDTH, \
+ TILE_HEIGHT)
+#endif
+
+TESTBPTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBPTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBPTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8, 1, 1)
+TESTBPTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8, 1, 1)
+TESTBPTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10, 1, 1)
+TESTBPTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10, 1, 1)
+TESTBPTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 12, 1, 1)
+TESTBPTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12, 1, 1)
+TESTBPTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 16, 1, 1)
+TESTBPTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 16, 1, 1)
+TESTBPTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32)
+TESTBPTOBP(MT2T, uint8_t, 10 / 8, 2, 2, P010, uint16_t, 2, 2, 2, 10, 16, 32)
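+// MT2T above packs 10-bit samples; passing SRC_BPC as the textual "10 / 8"
+// makes size expressions such as kPaddedWidth * kPaddedHeight * SRC_BPC
+// expand to (w * h * 10) / 8, the byte count of packed 10-bit data.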
+
+#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
+ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
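+ /* U and V share one destination buffer as alternating rows: U rows at */ \
+ /* stride 2 * kStrideUV, with V rows offset into it by kStrideUV. */ \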
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
+ kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+ dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \
+ kStrideUV * 2, kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \
+ for (int j = 0; j < kStrideUV; ++j) { \
+ EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_argb); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ + 1, _Any, +, 0) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 2) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+#else
+#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+#endif
+
+TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2)
+TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2)
+TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1)
+TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1)
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2)
+TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1)
+TESTATOPLANAR(ABGR, 4, 1, J420, 2, 2)
+TESTATOPLANAR(ABGR, 4, 1, J422, 2, 1)
+#ifdef LITTLE_ENDIAN_ONLY_TEST
+TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2)
+TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2)
+TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2)
+#endif
+TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2)
+TESTATOPLANAR(I400, 1, 1, I420, 2, 2)
+TESTATOPLANAR(J400, 1, 1, J420, 2, 2)
+TESTATOPLANAR(RAW, 3, 1, I420, 2, 2)
+TESTATOPLANAR(RAW, 3, 1, J420, 2, 2)
+TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2)
+TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2)
+TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1)
+TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2)
+TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1)
+
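+// TESTATOPLANARAI mirrors TESTATOPLANARI for conversions that also emit
+// an alpha plane (here ARGBToI420Alpha): dst_a_c and dst_a_opt get
+// distinct sentinel fills and are compared alongside the Y plane.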
+#define TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, \
+ SUBSAMP_Y, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
+ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
+ align_buffer_page_end(dst_a_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_a_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_a_c, 1, kWidth* kHeight); \
+ memset(dst_y_c, 2, kWidth* kHeight); \
+ memset(dst_uv_c, 3, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_a_opt, 101, kWidth* kHeight); \
+ memset(dst_y_opt, 102, kWidth* kHeight); \
+ memset(dst_uv_opt, 103, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
+ kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \
+ dst_a_c, kWidth, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+ dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \
+ kStrideUV * 2, dst_a_opt, kWidth, kWidth, \
+ NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ EXPECT_EQ(dst_a_c[i * kWidth + j], dst_a_opt[i * kWidth + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \
+ for (int j = 0; j < kStrideUV; ++j) { \
+ EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_a_c); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_a_opt); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_argb); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTATOPLANARA(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ + 1, _Any, +, 0) \
+ TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 2) \
+ TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+#else
+#define TESTATOPLANARA(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+#endif
+
+TESTATOPLANARA(ARGB, 4, 1, I420Alpha, 2, 2)
+
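+// TESTATOBPI covers packed-to-biplanar (NV12/NV21-style) destinations:
+// U and V are interleaved into a single dst_uv buffer, so the chroma
+// stride and comparison loops use kStrideUV * 2 bytes per row.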
+#define TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, \
+ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
+ kStrideUV * 2, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+ dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < kStrideUV * 2; ++j) { \
+ EXPECT_EQ(dst_uv_c[i * kStrideUV * 2 + j], \
+ dst_uv_opt[i * kStrideUV * 2 + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_argb); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+#define TESTATOBP(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ + 1, _Any, +, 0) \
+ TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 2) \
+ TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+#else
+#define TESTATOBP(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+#endif
+
+TESTATOBP(ARGB, 1, 4, NV12, 2, 2)
+TESTATOBP(ARGB, 1, 4, NV21, 2, 2)
+TESTATOBP(ABGR, 1, 4, NV12, 2, 2)
+TESTATOBP(ABGR, 1, 4, NV21, 2, 2)
+TESTATOBP(RAW, 1, 3, JNV21, 2, 2)
+TESTATOBP(YUY2, 2, 4, NV12, 2, 2)
+TESTATOBP(UYVY, 2, 4, NV12, 2, 2)
+TESTATOBP(AYUV, 1, 4, NV12, 2, 2)
+TESTATOBP(AYUV, 1, 4, NV21, 2, 2)
+
+#if !defined(LEAN_TESTS)
+
+#ifdef HAVE_JPEG
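+// ValidateJpeg is expected to require an SOI marker (0xff 0xd8) at the
+// start of the buffer and an EOI marker (0xff 0xd9) near the end; the
+// tests below construct minimal buffers around those two markers.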
+TEST_F(LibYUVConvertTest, ValidateJpeg) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ align_buffer_page_end(orig_pixels, kSize);
+
+ // No SOI or EOI. Expect fail.
+ memset(orig_pixels, 0, kSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ // Test special value that matches marker start.
+ memset(orig_pixels, 0xff, kSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ // EOI, SOI. Expect pass.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
+ }
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ const int kMultiple = 10;
+ const int kBufSize = kImageSize * kMultiple + kOff;
+ align_buffer_page_end(orig_pixels, kBufSize);
+
+ // No SOI or EOI. Expect fail.
+ memset(orig_pixels, 0, kBufSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kBufSize));
+
+ // EOI, SOI. Expect pass.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_TRUE(ValidateJpeg(orig_pixels, kBufSize));
+ }
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, InvalidateJpeg) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
+ ? benchmark_width_ * benchmark_height_
+ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ align_buffer_page_end(orig_pixels, kSize);
+
+ // NULL pointer. Expect fail.
+ EXPECT_FALSE(ValidateJpeg(NULL, kSize));
+
+ // Negative size. Expect fail.
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, -1));
+
+ // Too large size. Expect fail.
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, 0xfb000000ull));
+
+ // No SOI or EOI. Expect fail.
+ memset(orig_pixels, 0, kSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ // SOI but no EOI. Expect fail.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+ }
+
+ // EOI but no SOI. Expect fail.
+ orig_pixels[0] = 0;
+ orig_pixels[1] = 0;
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, FuzzJpeg) {
+ // SOI but no EOI. Expect fail.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ const int kSize = fastrand() % 5000 + 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ MemRandomize(orig_pixels, kSize);
+
+ // Add SOI so frame will be scanned.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
+ orig_pixels[kSize - 1] = 0xff;
+ ValidateJpeg(orig_pixels,
+ kSize); // Failure normally expected.
+ free_aligned_buffer_page_end(orig_pixels);
+ }
+}
+
+// Test data created in GIMP. In the JPEG export dialog, disable
+// thumbnails etc., choose a subsampling, and use low quality (50) to
+// keep the size small. Generated with xxd -i test.jpg.
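+// For example (hypothetical filename), a replacement fixture could be
+// regenerated with:
+//   xxd -i test.jpg > test_jpg.inc
+// and the emitted array and length constant pasted in below.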
+// test 0 is J400
+static const uint8_t kTest0Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xc2, 0x00, 0x0b, 0x08, 0x00, 0x10,
+ 0x00, 0x20, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00, 0x17, 0x00, 0x01,
+ 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x01, 0x00, 0x00, 0x00, 0x01, 0x43, 0x7e, 0xa7, 0x97, 0x57, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0xc0, 0x6f, 0x66, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
+ 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
+ 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
+ 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
+ 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x21, 0x65, 0x6e, 0x31, 0x86, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
+ 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
+ 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x35, 0xff, 0xc4, 0x00, 0x1f, 0x10,
+ 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
+ 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x10, 0x0b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x88, 0xab, 0x8b,
+ 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
+ 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
+ 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
+ 0xd9};
+static const size_t kTest0JpgLen = 421;
+
+// test 1 is J444
+static const uint8_t kTest1Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x11, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
+ 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x01, 0x03, 0xff, 0xda,
+ 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01,
+ 0x40, 0x8f, 0x26, 0xe8, 0xf4, 0xcc, 0xf9, 0x69, 0x2b, 0x1b, 0x2a, 0xcb,
+ 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11,
+ 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00,
+ 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99,
+ 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x01, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x10, 0x11, 0x02, 0x12, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0xf1, 0x00, 0x27, 0x45, 0xbb, 0x31,
+ 0xaf, 0xff, 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x02, 0x03, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x02, 0x10, 0x11, 0x41, 0x12, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01,
+ 0x01, 0x3f, 0x01, 0xf6, 0x4b, 0x5f, 0x48, 0xb3, 0x69, 0x63, 0x35, 0x72,
+ 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11,
+ 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00,
+ 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2,
+ 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c,
+ 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61,
+ 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21,
+ 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01,
+ 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48,
+ 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01,
+ 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x26, 0x61, 0xd4, 0xff,
+ 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x03, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
+ 0x31, 0x41, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
+ 0x10, 0x54, 0xa8, 0xbf, 0x50, 0x87, 0xb0, 0x9d, 0x8b, 0xc4, 0x6a, 0x26,
+ 0x6b, 0x2a, 0x9c, 0x1f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x00, 0x11, 0x21, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02,
+ 0x01, 0x01, 0x3f, 0x10, 0x70, 0xe1, 0x3e, 0xd1, 0x8e, 0x0d, 0xe1, 0xb5,
+ 0xd5, 0x91, 0x76, 0x43, 0x82, 0x45, 0x4c, 0x7b, 0x7f, 0xff, 0xc4, 0x00,
+ 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61,
+ 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01,
+ 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a,
+ 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96,
+ 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad,
+ 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7,
+ 0xd4, 0xff, 0xd9};
+static const size_t kTest1JpgLen = 735;
+
+// test 2 is J420
+static const uint8_t kTest2Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x05, 0x01, 0x02, 0x04, 0xff,
+ 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0x20, 0xe7, 0x28, 0xa3, 0x0b, 0x2e, 0x2d, 0xcf, 0xff, 0xc4, 0x00,
+ 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10,
+ 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02,
+ 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62,
+ 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x00, 0x03, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
+ 0x01, 0xc8, 0x53, 0xff, 0xc4, 0x00, 0x16, 0x11, 0x01, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x32, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f,
+ 0x01, 0xd2, 0xc7, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
+ 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
+ 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
+ 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
+ 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
+ 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
+ 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c,
+ 0x03, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x13, 0x5f,
+ 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11,
+ 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x0e,
+ 0xa1, 0x3a, 0x76, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x21, 0x11, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01,
+ 0x3f, 0x10, 0x57, 0x0b, 0x08, 0x70, 0xdb, 0xff, 0xc4, 0x00, 0x1f, 0x10,
+ 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
+ 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b,
+ 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
+ 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
+ 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
+ 0xd9};
+static const size_t kTest2JpgLen = 685;
+
+// test 3 is J422
+static const uint8_t kTest3Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x21, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
+ 0x00, 0x17, 0x01, 0x00, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x00, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0x43, 0x8d, 0x1f, 0xa2, 0xb3, 0xca, 0x1b, 0x57, 0x0f, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x02, 0x10, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03,
+ 0x01, 0x01, 0x3f, 0x01, 0x51, 0xce, 0x8c, 0x75, 0xff, 0xc4, 0x00, 0x18,
+ 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x61, 0x21, 0xff, 0xda,
+ 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xa6, 0xd9, 0x2f, 0x84,
+ 0xe8, 0xf0, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda,
+ 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32,
+ 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00,
+ 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31,
+ 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f,
+ 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9,
+ 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6,
+ 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03,
+ 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x2e, 0x45, 0xff,
+ 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
+ 0x31, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x53,
+ 0x50, 0xba, 0x54, 0xc1, 0x67, 0x4f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x00, 0x10, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0x18, 0x81, 0x5c, 0x04, 0x1a, 0xca,
+ 0x91, 0xbf, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04,
+ 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9,
+ 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5,
+ 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c,
+ 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00,
+ 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
+static const size_t kTest3JpgLen = 704;
+
+// test 4 is J422 vertical - not supported
+static const uint8_t kTest4Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x12, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x05, 0x01, 0x02, 0x03, 0xff,
+ 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0xd2, 0x98, 0xe9, 0x03, 0x0c, 0x00, 0x46, 0x21, 0xd9, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x01, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01,
+ 0x3f, 0x01, 0x98, 0xb1, 0xbd, 0x47, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x12, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xb6, 0x35, 0xa2, 0xe1, 0x47, 0xff,
+ 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02,
+ 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9,
+ 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01,
+ 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0,
+ 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e,
+ 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde,
+ 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a,
+ 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x24, 0xaf, 0xff, 0xc4, 0x00, 0x19,
+ 0x11, 0x00, 0x03, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x51, 0x21, 0x31, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x59, 0x11, 0xca,
+ 0x42, 0x60, 0x9f, 0x69, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x11, 0x21, 0x31, 0x61, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x02, 0x01, 0x01, 0x3f, 0x10, 0xb0, 0xd7, 0x27, 0x51, 0xb6, 0x41, 0xff,
+ 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31,
+ 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a,
+ 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd,
+ 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30,
+ 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03,
+ 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
+static const size_t kTest4JpgLen = 701;
+
+TEST_F(LibYUVConvertTest, TestMJPGSize) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ printf("test jpeg size %d x %d\n", width, height);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToI420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_u, half_width * half_height);
+ align_buffer_page_end(dst_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_u, half_width,
+ dst_v, half_width, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_u_hash = HashDjb2(dst_u, half_width * half_height, 5381);
+ uint32_t dst_v_hash = HashDjb2(dst_v, half_width * half_height, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_u_hash, 2501859930u);
+ EXPECT_EQ(dst_v_hash, 2126459123u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ // Convert to NV21
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_vu,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert to I420
+ align_buffer_page_end(dst2_y, width * height);
+ align_buffer_page_end(dst2_u, half_width * half_height);
+ align_buffer_page_end(dst2_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
+ dst2_v, half_width, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert I420 to NV21
+ align_buffer_page_end(dst3_y, width * height);
+ align_buffer_page_end(dst3_vu, half_width * half_height * 2);
+
+ I420ToNV21(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
+ width, dst3_vu, half_width * 2, width, height);
+
+ for (int i = 0; i < width * height; ++i) {
+ EXPECT_EQ(dst_y[i], dst3_y[i]);
+ }
+ for (int i = 0; i < half_width * half_height * 2; ++i) {
+ EXPECT_EQ(dst_vu[i], dst3_vu[i]);
+ }
+
+ free_aligned_buffer_page_end(dst3_y);
+ free_aligned_buffer_page_end(dst3_vu);
+
+ free_aligned_buffer_page_end(dst2_y);
+ free_aligned_buffer_page_end(dst2_u);
+ free_aligned_buffer_page_end(dst2_v);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ // Convert to NV12
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert to I420
+ align_buffer_page_end(dst2_y, width * height);
+ align_buffer_page_end(dst2_u, half_width * half_height);
+ align_buffer_page_end(dst2_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
+ dst2_v, half_width, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Convert I420 to NV12
+ align_buffer_page_end(dst3_y, width * height);
+ align_buffer_page_end(dst3_uv, half_width * half_height * 2);
+
+ I420ToNV12(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
+ width, dst3_uv, half_width * 2, width, height);
+
+ for (int i = 0; i < width * height; ++i) {
+ EXPECT_EQ(dst_y[i], dst3_y[i]);
+ }
+ for (int i = 0; i < half_width * half_height * 2; ++i) {
+ EXPECT_EQ(dst_uv[i], dst3_uv[i]);
+ }
+
+ free_aligned_buffer_page_end(dst3_y);
+ free_aligned_buffer_page_end(dst3_uv);
+
+ free_aligned_buffer_page_end(dst2_y);
+ free_aligned_buffer_page_end(dst2_u);
+ free_aligned_buffer_page_end(dst2_v);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 1069662856u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. Hashes are for VU order, so
+ // swap the UV channels before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_vu_hash, 1069662856u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+// TODO(fbarchard): Improve test to compare against I422, not checksum
+TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV21_422) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 493520167u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV12_422) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. Hashes are for VU order, so
+ // swap the UV channels before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_vu_hash, 493520167u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 330644005u);
+ EXPECT_EQ(dst_uv_hash, 135214341u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_400) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. Hashes are for VU order, so
+ // swap the UV channels before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 330644005u);
+ EXPECT_EQ(dst_vu_hash, 135214341u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 506143297u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV12(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value. Hashes are for VU order, so
+ // swap the UV channels before hashing.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+ SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
+ half_height);
+ uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_vu_hash, 506143297u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToARGB) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+ if (benchmark_iterations < 1) {
+ benchmark_iterations = 1;
+ }
+
+ align_buffer_page_end(dst_argb, width * height * 4);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToARGB(kTest3Jpg, kTest3JpgLen, dst_argb, width * 4, width,
+ height, width, height);
+ }
+ // Expect success.
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381);
+#ifdef LIBYUV_UNLIMITED_DATA
+ EXPECT_EQ(dst_argb_hash, 3900633302u);
+#else
+ EXPECT_EQ(dst_argb_hash, 2355976473u);
+#endif
+
+ free_aligned_buffer_page_end(dst_argb);
+}
+
+static int ShowJPegInfo(const uint8_t* sample, size_t sample_size) {
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+
+ int width = mjpeg_decoder.GetWidth();
+ int height = mjpeg_decoder.GetHeight();
+
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J420, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J422, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J444, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ printf("JPeg is J400, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ } else {
+ // Unknown colorspace.
+ printf("JPeg is Unknown colorspace.\n");
+ }
+ mjpeg_decoder.UnloadFrame();
+ return ret;
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGInfo) {
+ EXPECT_EQ(1, ShowJPegInfo(kTest0Jpg, kTest0JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg,
+ kTest4JpgLen)); // Valid but unsupported.
+}
+#endif // HAVE_JPEG
+
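+// NV12Crop verifies that ConvertToI420 with a crop_y offset matches a
+// manual NV12ToI420 call on pointers advanced into the cropped region;
+// crop_y is rounded down to an even value so the half-height UV plane
+// stays aligned with the crop.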
+TEST_F(LibYUVConvertTest, NV12Crop) {
+ const int SUBSAMP_X = 2;
+ const int SUBSAMP_Y = 2;
+ const int kWidth = benchmark_width_;
+ const int kHeight = benchmark_height_;
+ const int crop_y =
+ ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1;
+ const int kDestWidth = benchmark_width_;
+ const int kDestHeight = benchmark_height_ - crop_y * 2;
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int sample_size =
+ kWidth * kHeight + kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;
+ align_buffer_page_end(src_y, sample_size);
+ uint8_t* src_uv = src_y + kWidth * kHeight;
+
+ align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
+ align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight);
+ align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ for (int i = 0; i < kHeight * kWidth; ++i) {
+ src_y[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideUV) * 2; ++i) {
+ src_uv[i] = (fastrand() & 0xff);
+ }
+ memset(dst_y, 1, kDestWidth * kDestHeight);
+ memset(dst_u, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_y_2, 1, kDestWidth * kDestHeight);
+ memset(dst_u_2, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v_2, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ ConvertToI420(src_y, sample_size, dst_y_2, kDestWidth, dst_u_2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v_2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
+ kDestWidth, kDestHeight, libyuv::kRotate0, libyuv::FOURCC_NV12);
+
+ NV12ToI420(src_y + crop_y * kWidth, kWidth,
+ src_uv + (crop_y / 2) * kStrideUV * 2, kStrideUV * 2, dst_y,
+ kDestWidth, dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), kDestWidth, kDestHeight);
+
+ for (int i = 0; i < kDestHeight; ++i) {
+ for (int j = 0; j < kDestWidth; ++j) {
+ EXPECT_EQ(dst_y[i * kWidth + j], dst_y_2[i * kWidth + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
+ dst_u_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
+ dst_v_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
+ free_aligned_buffer_page_end(dst_y_2);
+ free_aligned_buffer_page_end(dst_u_2);
+ free_aligned_buffer_page_end(dst_v_2);
+ free_aligned_buffer_page_end(src_y);
+}
+
+TEST_F(LibYUVConvertTest, I420CropOddY) {
+ const int SUBSAMP_X = 2;
+ const int SUBSAMP_Y = 2;
+ const int kWidth = benchmark_width_;
+ const int kHeight = benchmark_height_;
+ const int crop_y = benchmark_height_ > 1 ? 1 : 0;
+ const int kDestWidth = benchmark_width_;
+ const int kDestHeight = benchmark_height_ - crop_y * 2;
+ const int kStrideU = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int kStrideV = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int sample_size = kWidth * kHeight +
+ kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y) +
+ kStrideV * SUBSAMPLE(kHeight, SUBSAMP_Y);
+ align_buffer_page_end(src_y, sample_size);
+ uint8_t* src_u = src_y + kWidth * kHeight;
+ uint8_t* src_v = src_u + kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y);
+
+ align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
+ align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ for (int i = 0; i < kHeight * kWidth; ++i) {
+ src_y[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideU; ++i) {
+ src_u[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideV; ++i) {
+ src_v[i] = (fastrand() & 0xff);
+ }
+ memset(dst_y, 1, kDestWidth * kDestHeight);
+ memset(dst_u, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ConvertToI420(src_y, sample_size, dst_y, kDestWidth, dst_u,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
+ kDestWidth, kDestHeight, libyuv::kRotate0,
+ libyuv::FOURCC_I420);
+ }
+
+ for (int i = 0; i < kDestHeight; ++i) {
+ for (int j = 0; j < kDestWidth; ++j) {
+ EXPECT_EQ(src_y[crop_y * kWidth + i * kWidth + j],
+ dst_y[i * kDestWidth + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(src_u[(crop_y / 2 + i) * kStrideU + j],
+ dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(src_v[(crop_y / 2 + i) * kStrideV + j],
+ dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
+ free_aligned_buffer_page_end(src_y);
+}
+
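+// TESTPTOB checks a direct packed-to-NV12 conversion against a two-step
+// reference (packed -> I420 -> NV12). For the YUY2 instantiation the
+// sequence is roughly:
+//
+//   YUY2ToI420(...);  // step 1: reference planar output.
+//   I420ToNV12(...);  // step 2: reference biplanar output.
+//   YUY2ToNV12(...);  // direct path under test, compared byte for byte.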
+#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \
+ TEST_F(LibYUVConvertTest, NAME) { \
+ const int kWidth = benchmark_width_; \
+ const int kHeight = benchmark_height_; \
+ \
+ align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
+ align_buffer_page_end(orig_y, kWidth* kHeight); \
+ align_buffer_page_end(orig_u, \
+ SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ align_buffer_page_end(orig_v, \
+ SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ \
+ align_buffer_page_end(dst_y_orig, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_orig, \
+ 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ \
+ align_buffer_page_end(dst_y, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv, \
+ 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
+ \
+ MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
+ \
+ /* Convert the packed source to NV12 in 2 steps for reference */ \
+ libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), orig_y, kWidth, \
+ orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
+ SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
+ libyuv::I420ToNV12(orig_y, kWidth, orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
+ SUBSAMPLE(kWidth, 2), dst_y_orig, kWidth, dst_uv_orig, \
+ 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
+ \
+ /* Convert to NV12 */ \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), dst_y, kWidth, \
+ dst_uv, 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
+ } \
+ \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ EXPECT_EQ(orig_y[i], dst_y[i]); \
+ } \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ EXPECT_EQ(dst_y_orig[i], dst_y[i]); \
+ } \
+ for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); \
+ ++i) { \
+ EXPECT_EQ(dst_uv_orig[i], dst_uv[i]); \
+ } \
+ \
+ free_aligned_buffer_page_end(orig_uyvy); \
+ free_aligned_buffer_page_end(orig_y); \
+ free_aligned_buffer_page_end(orig_u); \
+ free_aligned_buffer_page_end(orig_v); \
+ free_aligned_buffer_page_end(dst_y_orig); \
+ free_aligned_buffer_page_end(dst_uv_orig); \
+ free_aligned_buffer_page_end(dst_y); \
+ free_aligned_buffer_page_end(dst_uv); \
+ }
+
+TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12)
+TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12)
+
+TEST_F(LibYUVConvertTest, MM21ToYUY2) {
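+ // MM21 is a tiled format; round width up to 16 and height up to 32 so
+ // whole tiles are covered.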
+ const int kWidth = (benchmark_width_ + 15) & (~15);
+ const int kHeight = (benchmark_height_ + 31) & (~31);
+
+ align_buffer_page_end(orig_y, kWidth * kHeight);
+ align_buffer_page_end(orig_uv,
+ 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));
+
+ align_buffer_page_end(tmp_y, kWidth * kHeight);
+ align_buffer_page_end(tmp_u, SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));
+ align_buffer_page_end(tmp_v, SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));
+
+ align_buffer_page_end(dst_yuyv, 4 * SUBSAMPLE(kWidth, 2) * kHeight);
+ align_buffer_page_end(golden_yuyv, 4 * SUBSAMPLE(kWidth, 2) * kHeight);
+
+ MemRandomize(orig_y, kWidth * kHeight);
+ MemRandomize(orig_uv, 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));
+
+ /* Convert MM21 to YUY2 in 2 steps for reference */
+ libyuv::MM21ToI420(orig_y, kWidth, orig_uv, 2 * SUBSAMPLE(kWidth, 2), tmp_y,
+ kWidth, tmp_u, SUBSAMPLE(kWidth, 2), tmp_v,
+ SUBSAMPLE(kWidth, 2), kWidth, kHeight);
+ libyuv::I420ToYUY2(tmp_y, kWidth, tmp_u, SUBSAMPLE(kWidth, 2), tmp_v,
+ SUBSAMPLE(kWidth, 2), golden_yuyv,
+ 4 * SUBSAMPLE(kWidth, 2), kWidth, kHeight);
+
+ /* Convert to YUY2 */
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ libyuv::MM21ToYUY2(orig_y, kWidth, orig_uv, 2 * SUBSAMPLE(kWidth, 2),
+ dst_yuyv, 4 * SUBSAMPLE(kWidth, 2), kWidth, kHeight);
+ }
+
+ for (int i = 0; i < 4 * SUBSAMPLE(kWidth, 2) * kHeight; ++i) {
+ EXPECT_EQ(dst_yuyv[i], golden_yuyv[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_y);
+ free_aligned_buffer_page_end(orig_uv);
+ free_aligned_buffer_page_end(tmp_y);
+ free_aligned_buffer_page_end(tmp_u);
+ free_aligned_buffer_page_end(tmp_v);
+ free_aligned_buffer_page_end(dst_yuyv);
+ free_aligned_buffer_page_end(golden_yuyv);
+}
+
+// Test RGB24 to J420 is exact
+#if defined(LIBYUV_BIT_EXACT)
+TEST_F(LibYUVConvertTest, TestRGB24ToJ420) {
+ const int kSize = 256;
+ align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24
+ align_buffer_page_end(dest_j420, kSize * 3 / 2 * 2);
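+ // Scale the iteration count so each run covers roughly
+ // benchmark_width_ x benchmark_height_ pixels in 256x2 blocks.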
+ int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 - 1)) /
+ (kSize * 2) * benchmark_iterations_;
+
+ for (int i = 0; i < kSize * 3 * 2; ++i) {
+ orig_rgb24[i] = i;
+ }
+
+ for (int i = 0; i < iterations256; ++i) {
+ RGB24ToJ420(orig_rgb24, kSize * 3, dest_j420, kSize, // Y plane
+ dest_j420 + kSize * 2, kSize / 2, // U plane
+ dest_j420 + kSize * 5 / 2, kSize / 2, // V plane
+ kSize, 2);
+ }
+
+ uint32_t checksum = HashDjb2(dest_j420, kSize * 3 / 2 * 2, 5381);
+ EXPECT_EQ(2755440272u, checksum);
+
+ free_aligned_buffer_page_end(orig_rgb24);
+ free_aligned_buffer_page_end(dest_j420);
+}
+#endif
+
+// Test RGB24 to I420 is exact
+#if defined(LIBYUV_BIT_EXACT)
+TEST_F(LibYUVConvertTest, TestRGB24ToI420) {
+ const int kSize = 256;
+ align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24
+ align_buffer_page_end(dest_i420, kSize * 3 / 2 * 2);
+ int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 - 1)) /
+ (kSize * 2) * benchmark_iterations_;
+
+ for (int i = 0; i < kSize * 3 * 2; ++i) {
+ orig_rgb24[i] = i;
+ }
+
+ for (int i = 0; i < iterations256; ++i) {
+ RGB24ToI420(orig_rgb24, kSize * 3, dest_i420, kSize, // Y plane
+ dest_i420 + kSize * 2, kSize / 2, // U plane
+ dest_i420 + kSize * 5 / 2, kSize / 2, // V plane
+ kSize, 2);
+ }
+
+ uint32_t checksum = HashDjb2(dest_i420, kSize * 3 / 2 * 2, 5381);
+ EXPECT_EQ(1526656597u, checksum);
+
+ free_aligned_buffer_page_end(orig_rgb24);
+ free_aligned_buffer_page_end(dest_i420);
+}
+#endif
+
+#endif // !defined(LEAN_TESTS)
+
+} // namespace libyuv
diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc
new file mode 100644
index 00000000..437b6632
--- /dev/null
+++ b/unit_test/cpu_test.cc
@@ -0,0 +1,342 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/version.h"
+
+namespace libyuv {
+
+TEST_F(LibYUVBaseTest, TestCpuHas) {
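+ // A mask of -1 returns all detected CPU flags.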
+ int cpu_flags = TestCpuFlag(-1);
+ printf("Cpu Flags 0x%x\n", cpu_flags);
+#if defined(__arm__) || defined(__aarch64__)
+ int has_arm = TestCpuFlag(kCpuHasARM);
+ printf("Has ARM 0x%x\n", has_arm);
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ printf("Has NEON 0x%x\n", has_neon);
+#endif
+#if defined(__riscv) && defined(__linux__)
+ int has_riscv = TestCpuFlag(kCpuHasRISCV);
+ printf("Has RISCV 0x%x\n", has_riscv);
+ int has_rvv = TestCpuFlag(kCpuHasRVV);
+ printf("Has RVV 0x%x\n", has_rvv);
+ int has_rvvzvfh = TestCpuFlag(kCpuHasRVVZVFH);
+ printf("Has RVVZVFH 0x%x\n", has_rvvzvfh);
+#endif
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+ defined(_M_X64)
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ int has_sse41 = TestCpuFlag(kCpuHasSSE41);
+ int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+ int has_avx = TestCpuFlag(kCpuHasAVX);
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ int has_erms = TestCpuFlag(kCpuHasERMS);
+ int has_fma3 = TestCpuFlag(kCpuHasFMA3);
+ int has_f16c = TestCpuFlag(kCpuHasF16C);
+ int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
+ int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
+ int has_avx512vnni = TestCpuFlag(kCpuHasAVX512VNNI);
+ int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
+ int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
+ int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
+ int has_avx10 = TestCpuFlag(kCpuHasAVX10);
+ int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
+ int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
+ printf("Has X86 0x%x\n", has_x86);
+ printf("Has SSE2 0x%x\n", has_sse2);
+ printf("Has SSSE3 0x%x\n", has_ssse3);
+ printf("Has SSE41 0x%x\n", has_sse41);
+ printf("Has SSE42 0x%x\n", has_sse42);
+ printf("Has AVX 0x%x\n", has_avx);
+ printf("Has AVX2 0x%x\n", has_avx2);
+ printf("Has ERMS 0x%x\n", has_erms);
+ printf("Has FMA3 0x%x\n", has_fma3);
+ printf("Has F16C 0x%x\n", has_f16c);
+ printf("Has AVX512BW 0x%x\n", has_avx512bw);
+ printf("Has AVX512VL 0x%x\n", has_avx512vl);
+ printf("Has AVX512VNNI 0x%x\n", has_avx512vnni);
+ printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi);
+ printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2);
+ printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg);
+ printf("Has AVX10 0x%x\n", has_avx10);
+ printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
+ printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
+#endif
+#if defined(__mips__)
+ int has_mips = TestCpuFlag(kCpuHasMIPS);
+ printf("Has MIPS 0x%x\n", has_mips);
+ int has_msa = TestCpuFlag(kCpuHasMSA);
+ printf("Has MSA 0x%x\n", has_msa);
+#endif
+#if defined(__loongarch__)
+ int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
+ printf("Has LOONGARCH 0x%x\n", has_loongarch);
+ int has_lsx = TestCpuFlag(kCpuHasLSX);
+ printf("Has LSX 0x%x\n", has_lsx);
+ int has_lasx = TestCpuFlag(kCpuHasLASX);
+ printf("Has LASX 0x%x\n", has_lasx);
+#endif
+}
+
+TEST_F(LibYUVBaseTest, TestCompilerMacros) {
+ // Tests all macros used in public headers.
+#ifdef __ATOMIC_RELAXED
+ printf("__ATOMIC_RELAXED %d\n", __ATOMIC_RELAXED);
+#endif
+#ifdef __cplusplus
+ printf("__cplusplus %ld\n", __cplusplus);
+#endif
+#ifdef __clang_major__
+ printf("__clang_major__ %d\n", __clang_major__);
+#endif
+#ifdef __clang_minor__
+ printf("__clang_minor__ %d\n", __clang_minor__);
+#endif
+#ifdef __GNUC__
+ printf("__GNUC__ %d\n", __GNUC__);
+#endif
+#ifdef __GNUC_MINOR__
+ printf("__GNUC_MINOR__ %d\n", __GNUC_MINOR__);
+#endif
+#ifdef __i386__
+ printf("__i386__ %d\n", __i386__);
+#endif
+#ifdef __x86_64__
+ printf("__x86_64__ %d\n", __x86_64__);
+#endif
+#ifdef _M_IX86
+ printf("_M_IX86 %d\n", _M_IX86);
+#endif
+#ifdef _M_X64
+ printf("_M_X64 %d\n", _M_X64);
+#endif
+#ifdef _MSC_VER
+ printf("_MSC_VER %d\n", _MSC_VER);
+#endif
+#ifdef __aarch64__
+ printf("__aarch64__ %d\n", __aarch64__);
+#endif
+#ifdef __arm__
+ printf("__arm__ %d\n", __arm__);
+#endif
+#ifdef __riscv
+ printf("__riscv %d\n", __riscv);
+#endif
+#ifdef __riscv_vector
+ printf("__riscv_vector %d\n", __riscv_vector);
+#endif
+#ifdef __riscv_v_intrinsic
+ printf("__riscv_v_intrinsic %d\n", __riscv_v_intrinsic);
+#endif
+#ifdef __APPLE__
+ printf("__APPLE__ %d\n", __APPLE__);
+#endif
+#ifdef __clang__
+ printf("__clang__ %d\n", __clang__);
+#endif
+#ifdef __CLR_VER
+ printf("__CLR_VER %d\n", __CLR_VER);
+#endif
+#ifdef __CYGWIN__
+ printf("__CYGWIN__ %d\n", __CYGWIN__);
+#endif
+#ifdef __llvm__
+ printf("__llvm__ %d\n", __llvm__);
+#endif
+#ifdef __mips_msa
+ printf("__mips_msa %d\n", __mips_msa);
+#endif
+#ifdef __mips
+ printf("__mips %d\n", __mips);
+#endif
+#ifdef __mips_isa_rev
+ printf("__mips_isa_rev %d\n", __mips_isa_rev);
+#endif
+#ifdef _MIPS_ARCH_LOONGSON3A
+ printf("_MIPS_ARCH_LOONGSON3A %d\n", _MIPS_ARCH_LOONGSON3A);
+#endif
+#ifdef __loongarch__
+ printf("__loongarch__ %d\n", __loongarch__);
+#endif
+#ifdef _WIN32
+ printf("_WIN32 %d\n", _WIN32);
+#endif
+#ifdef __native_client__
+ printf("__native_client__ %d\n", __native_client__);
+#endif
+#ifdef __pic__
+ printf("__pic__ %d\n", __pic__);
+#endif
+#ifdef __pnacl__
+ printf("__pnacl__ %d\n", __pnacl__);
+#endif
+#ifdef GG_LONGLONG
+ printf("GG_LONGLONG %lld\n", GG_LONGLONG(1));
+#endif
+#ifdef INT_TYPES_DEFINED
+ printf("INT_TYPES_DEFINED\n");
+#endif
+#ifdef __has_feature
+ printf("__has_feature\n");
+#if __has_feature(memory_sanitizer)
+ printf("__has_feature(memory_sanitizer) %d\n",
+ __has_feature(memory_sanitizer));
+#endif
+#endif
+}
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+ defined(_M_X64)
+TEST_F(LibYUVBaseTest, TestCpuId) {
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ if (has_x86) {
+ int cpu_info[4];
+ // Vendor ID:
+ // AuthenticAMD AMD processor
+ // CentaurHauls Centaur processor
+ // CyrixInstead Cyrix processor
+ // GenuineIntel Intel processor
+ // GenuineTMx86 Transmeta processor
+ // Geode by NSC National Semiconductor processor
+ // NexGenDriven NexGen processor
+ // RiseRiseRise Rise Technology processor
+ // SiS SiS SiS SiS processor
+ // UMC UMC UMC UMC processor
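+ // CPUID leaf 0 returns the vendor string in EBX, EDX, ECX; cpu_info
+ // holds EAX, EBX, ECX, EDX, so swap the words into string order and
+ // NUL terminate before printing.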
+ CpuId(0, 0, cpu_info);
+ cpu_info[0] = cpu_info[1]; // Reorder output
+ cpu_info[1] = cpu_info[3];
+ cpu_info[3] = 0;
+ printf("Cpu Vendor: %s 0x%x 0x%x 0x%x\n",
+ reinterpret_cast<char*>(&cpu_info[0]), cpu_info[0], cpu_info[1],
+ cpu_info[2]);
+ EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0])));
+
+ // CPU Family and Model
+ // 3:0 - Stepping
+ // 7:4 - Model
+ // 11:8 - Family
+ // 13:12 - Processor Type
+ // 19:16 - Extended Model
+ // 27:20 - Extended Family
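+ // Merge the base and extended bits into a single family and model.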
+ CpuId(1, 0, cpu_info);
+ int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+ int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+ printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model,
+ model);
+ }
+}
+#endif
+
+static int FileExists(const char* file_name) {
+ FILE* f = fopen(file_name, "r");
+ if (!f) {
+ return 0;
+ }
+ fclose(f);
+ return 1;
+}
+
+TEST_F(LibYUVBaseTest, TestLinuxNeon) {
+ if (FileExists("../../unit_test/testdata/arm_v7.txt")) {
+ printf("Note: testing to load \"../../unit_test/testdata/arm_v7.txt\"\n");
+
+ EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
+ EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
+ EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/juno.txt"));
+ } else {
+ printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n");
+ }
+#if defined(__linux__) && defined(__ARM_NEON__)
+ if (FileExists("/proc/cpuinfo")) {
+ if (kCpuHasNEON != ArmCpuCaps("/proc/cpuinfo")) {
+ // This can happen on ARM emulator but /proc/cpuinfo is from host.
+ printf("WARNING: Neon build enabled but CPU does not have NEON\n");
+ }
+ } else {
+ printf("WARNING: unable to load \"/proc/cpuinfo\"\n");
+ }
+#endif
+}
+
+TEST_F(LibYUVBaseTest, TestLinuxMipsMsa) {
+ if (FileExists("../../unit_test/testdata/mips.txt")) {
+ printf("Note: testing to load \"../../unit_test/testdata/mips.txt\"\n");
+
+ EXPECT_EQ(0, MipsCpuCaps("../../unit_test/testdata/mips.txt"));
+ EXPECT_EQ(kCpuHasMSA, MipsCpuCaps("../../unit_test/testdata/mips_msa.txt"));
+ EXPECT_EQ(kCpuHasMSA,
+ MipsCpuCaps("../../unit_test/testdata/mips_loongson2k.txt"));
+ } else {
+ printf("WARNING: unable to load \"../../unit_test/testdata/mips.txt\"\n");
+ }
+}
+
+TEST_F(LibYUVBaseTest, TestLinuxRVV) {
+ if (FileExists("../../unit_test/testdata/riscv64.txt")) {
+ printf("Note: testing to load \"../../unit_test/testdata/riscv64.txt\"\n");
+
+ EXPECT_EQ(0, RiscvCpuCaps("../../unit_test/testdata/riscv64.txt"));
+ EXPECT_EQ(kCpuHasRVV,
+ RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv.txt"));
+ EXPECT_EQ(kCpuHasRVV | kCpuHasRVVZVFH,
+ RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv_zvfh.txt"));
+ } else {
+ printf(
+ "WARNING: unable to load "
+ "\"../../unit_test/testdata/riscv64.txt\"\n");
+ }
+#if defined(__linux__) && defined(__riscv)
+ if (FileExists("/proc/cpuinfo")) {
+ if (!(kCpuHasRVV & RiscvCpuCaps("/proc/cpuinfo"))) {
+ // This can happen on RVV emulator but /proc/cpuinfo is from host.
+ printf("WARNING: RVV build enabled but CPU does not have RVV\n");
+ }
+ } else {
+ printf("WARNING: unable to load \"/proc/cpuinfo\"\n");
+ }
+#endif
+}
+
+// TODO(fbarchard): Fix clangcl test of cpuflags.
+#ifdef _MSC_VER
+TEST_F(LibYUVBaseTest, DISABLED_TestSetCpuFlags) {
+#else
+TEST_F(LibYUVBaseTest, TestSetCpuFlags) {
+#endif
+ // Reset any masked flags that may have been set so auto init is enabled.
+ MaskCpuFlags(0);
+
+ int original_cpu_flags = TestCpuFlag(-1);
+
+ // Test setting different CPU configurations.
+ int cpu_flags = kCpuHasARM | kCpuHasNEON | kCpuInitialized;
+ SetCpuFlags(cpu_flags);
+ EXPECT_EQ(cpu_flags, TestCpuFlag(-1));
+
+ cpu_flags = kCpuHasX86 | kCpuInitialized;
+ SetCpuFlags(cpu_flags);
+ EXPECT_EQ(cpu_flags, TestCpuFlag(-1));
+
+ // Test that setting 0 turns auto-init back on.
+ SetCpuFlags(0);
+ EXPECT_EQ(original_cpu_flags, TestCpuFlag(-1));
+
+ // Restore the CPU flag mask.
+ MaskCpuFlags(benchmark_cpu_info_);
+}
+
+} // namespace libyuv
diff --git a/files/unit_test/cpu_thread_test.cc b/unit_test/cpu_thread_test.cc
index 59061b98..69aab74e 100644
--- a/files/unit_test/cpu_thread_test.cc
+++ b/unit_test/cpu_thread_test.cc
@@ -12,7 +12,7 @@
#include "libyuv/cpu_id.h"
-#if defined(__clang__)
+#if defined(__clang__) && !defined(__wasm__)
#if __has_include(<pthread.h>)
#define LIBYUV_HAVE_PTHREAD 1
#endif
@@ -30,7 +30,7 @@ namespace libyuv {
void* ThreadMain(void* arg) {
int* flags = static_cast<int*>(arg);
- *flags = TestCpuFlag(kCpuHasSSSE3);
+ *flags = TestCpuFlag(kCpuInitialized);
return nullptr;
}
#endif // LIBYUV_HAVE_PTHREAD
diff --git a/files/unit_test/math_test.cc b/unit_test/math_test.cc
index 0abbad51..a1544c12 100644
--- a/files/unit_test/math_test.cc
+++ b/unit_test/math_test.cc
@@ -16,10 +16,14 @@
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
#include "libyuv/scale_row.h"
+#endif
namespace libyuv {
+#ifdef ENABLE_ROW_TESTS
TEST_F(LibYUVBaseTest, TestFixedDiv) {
int num[1280];
int div[1280];
@@ -151,5 +155,6 @@ TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) {
EXPECT_NEAR(result_c[j], result_opt[j], 1);
}
}
+#endif // ENABLE_ROW_TESTS
} // namespace libyuv
diff --git a/files/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 70f8966e..ec1d72eb 100644
--- a/files/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -12,9 +12,6 @@
#include <stdlib.h>
#include <time.h>
-// row.h defines SIMD_ALIGNED, overriding unit_test.h
-#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
-
#include "../unit_test/unit_test.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@@ -24,6 +21,19 @@
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
+#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
+// row.h defines SIMD_ALIGNED, overriding unit_test.h
+// TODO(fbarchard): Remove row.h from unittests. Test public functions.
+#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
+#endif
+
+#if defined(LIBYUV_BIT_EXACT)
+#define EXPECTED_UNATTENUATE_DIFF 0
+#else
+#define EXPECTED_UNATTENUATE_DIFF 2
+#endif
namespace libyuv {
@@ -47,12 +57,17 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
orig_pixels[2 * 4 + 0] = 16u;
orig_pixels[2 * 4 + 1] = 64u;
orig_pixels[2 * 4 + 2] = 192u;
- orig_pixels[2 * 4 + 3] = 255u;
+ orig_pixels[2 * 4 + 3] = 128u;
orig_pixels[3 * 4 + 0] = 16u;
orig_pixels[3 * 4 + 1] = 64u;
orig_pixels[3 * 4 + 2] = 192u;
- orig_pixels[3 * 4 + 3] = 128u;
- ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1);
+ orig_pixels[3 * 4 + 3] = 255u;
+ orig_pixels[4 * 4 + 0] = 255u;
+ orig_pixels[4 * 4 + 1] = 255u;
+ orig_pixels[4 * 4 + 2] = 255u;
+ orig_pixels[4 * 4 + 3] = 255u;
+
+ ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 5, 1);
EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]);
EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]);
EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]);
@@ -61,14 +76,55 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]);
EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]);
EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]);
- EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]);
- EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]);
- EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]);
- EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]);
- EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]);
- EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]);
- EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]);
- EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]);
+ EXPECT_EQ(32u, unatten_pixels[2 * 4 + 0]);
+ EXPECT_EQ(128u, unatten_pixels[2 * 4 + 1]);
+ EXPECT_EQ(255u, unatten_pixels[2 * 4 + 2]);
+ EXPECT_EQ(128u, unatten_pixels[2 * 4 + 3]);
+ EXPECT_EQ(16u, unatten_pixels[3 * 4 + 0]);
+ EXPECT_EQ(64u, unatten_pixels[3 * 4 + 1]);
+ EXPECT_EQ(192u, unatten_pixels[3 * 4 + 2]);
+ EXPECT_EQ(255u, unatten_pixels[3 * 4 + 3]);
+ EXPECT_EQ(255u, unatten_pixels[4 * 4 + 0]);
+ EXPECT_EQ(255u, unatten_pixels[4 * 4 + 1]);
+ EXPECT_EQ(255u, unatten_pixels[4 * 4 + 2]);
+ EXPECT_EQ(255u, unatten_pixels[4 * 4 + 3]);
+
+ ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 5, 1);
+ EXPECT_EQ(100u, atten_pixels[0 * 4 + 0]);
+ EXPECT_EQ(65u, atten_pixels[0 * 4 + 1]);
+ EXPECT_EQ(64u, atten_pixels[0 * 4 + 2]);
+ EXPECT_EQ(128u, atten_pixels[0 * 4 + 3]);
+ EXPECT_EQ(0u, atten_pixels[1 * 4 + 0]);
+ EXPECT_EQ(0u, atten_pixels[1 * 4 + 1]);
+ EXPECT_EQ(0u, atten_pixels[1 * 4 + 2]);
+ EXPECT_EQ(0u, atten_pixels[1 * 4 + 3]);
+ EXPECT_EQ(8u, atten_pixels[2 * 4 + 0]);
+ EXPECT_EQ(32u, atten_pixels[2 * 4 + 1]);
+ EXPECT_EQ(96u, atten_pixels[2 * 4 + 2]);
+ EXPECT_EQ(128u, atten_pixels[2 * 4 + 3]);
+ EXPECT_EQ(16u, atten_pixels[3 * 4 + 0]);
+ EXPECT_EQ(64u, atten_pixels[3 * 4 + 1]);
+ EXPECT_EQ(192u, atten_pixels[3 * 4 + 2]);
+ EXPECT_EQ(255u, atten_pixels[3 * 4 + 3]);
+ EXPECT_EQ(255u, atten_pixels[4 * 4 + 0]);
+ EXPECT_EQ(255u, atten_pixels[4 * 4 + 1]);
+ EXPECT_EQ(255u, atten_pixels[4 * 4 + 2]);
+ EXPECT_EQ(255u, atten_pixels[4 * 4 + 3]);
+
+ // Test that attenuating fully opaque pixels leaves the color channels
+ // unchanged.
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i * 4 + 0] = i;
+ orig_pixels[i * 4 + 1] = 0;
+ orig_pixels[i * 4 + 2] = 0;
+ orig_pixels[i * 4 + 3] = 255;
+ }
+ ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 256, 1);
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(orig_pixels[i * 4 + 0], atten_pixels[i * 4 + 0]);
+ EXPECT_EQ(0, atten_pixels[i * 4 + 1]);
+ EXPECT_EQ(0, atten_pixels[i * 4 + 2]);
+ EXPECT_EQ(255, atten_pixels[i * 4 + 3]);
+ }
for (int i = 0; i < 1280; ++i) {
orig_pixels[i * 4 + 0] = i;
@@ -82,10 +138,10 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1);
}
for (int i = 0; i < 1280; ++i) {
- EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2);
- EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2);
- EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2);
- EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2);
+ EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 1);
+ EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 1);
+ EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 1);
+ EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 1);
}
// Make sure transparent, 50% and opaque are fully accurate.
EXPECT_EQ(0, atten_pixels[0 * 4 + 0]);
@@ -96,9 +152,9 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
EXPECT_EQ(21, atten_pixels[128 * 4 + 2]);
EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
- EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1);
- EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1);
- EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1);
+ EXPECT_EQ(255, atten_pixels[255 * 4 + 0]);
+ EXPECT_EQ(127, atten_pixels[255 * 4 + 1]);
+ EXPECT_EQ(85, atten_pixels[255 * 4 + 2]);
EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
free_aligned_buffer_page_end(atten2_pixels);
@@ -151,31 +207,32 @@ static int TestAttenuateI(int width,
}
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) {
- int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestAttenuateI(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
- EXPECT_LE(max_diff, 2);
+
+ EXPECT_EQ(max_diff, 0);
}
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) {
int max_diff =
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
- EXPECT_LE(max_diff, 2);
+ EXPECT_EQ(max_diff, 0);
}
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) {
int max_diff =
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
- EXPECT_LE(max_diff, 2);
+ EXPECT_EQ(max_diff, 0);
}
TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) {
int max_diff =
TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
- EXPECT_LE(max_diff, 2);
+ EXPECT_EQ(max_diff, 0);
}
static int TestUnattenuateI(int width,
@@ -224,31 +281,31 @@ static int TestUnattenuateI(int width,
}
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) {
- int max_diff = TestUnattenuateI(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
- EXPECT_LE(max_diff, 2);
+ EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
}
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 1);
- EXPECT_LE(max_diff, 2);
+ EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
}
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, -1, 0);
- EXPECT_LE(max_diff, 2);
+ EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
}
TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
- EXPECT_LE(max_diff, 2);
+ EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
}
TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
@@ -277,6 +334,7 @@ TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
}
}
+// NEAR tolerances below allow off-by-one results on legacy platforms.
TEST_F(LibYUVPlanarTest, TestARGBGray) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
@@ -313,17 +371,17 @@ TEST_F(LibYUVPlanarTest, TestARGBGray) {
orig_pixels[5][3] = 224u;
// Do 16 to test asm version.
ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);
- EXPECT_EQ(30u, orig_pixels[0][0]);
- EXPECT_EQ(30u, orig_pixels[0][1]);
- EXPECT_EQ(30u, orig_pixels[0][2]);
+ EXPECT_NEAR(29u, orig_pixels[0][0], 1);
+ EXPECT_NEAR(29u, orig_pixels[0][1], 1);
+ EXPECT_NEAR(29u, orig_pixels[0][2], 1);
EXPECT_EQ(128u, orig_pixels[0][3]);
EXPECT_EQ(149u, orig_pixels[1][0]);
EXPECT_EQ(149u, orig_pixels[1][1]);
EXPECT_EQ(149u, orig_pixels[1][2]);
EXPECT_EQ(0u, orig_pixels[1][3]);
- EXPECT_EQ(76u, orig_pixels[2][0]);
- EXPECT_EQ(76u, orig_pixels[2][1]);
- EXPECT_EQ(76u, orig_pixels[2][2]);
+ EXPECT_NEAR(77u, orig_pixels[2][0], 1);
+ EXPECT_NEAR(77u, orig_pixels[2][1], 1);
+ EXPECT_NEAR(77u, orig_pixels[2][2], 1);
EXPECT_EQ(255u, orig_pixels[2][3]);
EXPECT_EQ(0u, orig_pixels[3][0]);
EXPECT_EQ(0u, orig_pixels[3][1]);
@@ -333,9 +391,9 @@ TEST_F(LibYUVPlanarTest, TestARGBGray) {
EXPECT_EQ(255u, orig_pixels[4][1]);
EXPECT_EQ(255u, orig_pixels[4][2]);
EXPECT_EQ(255u, orig_pixels[4][3]);
- EXPECT_EQ(96u, orig_pixels[5][0]);
- EXPECT_EQ(96u, orig_pixels[5][1]);
- EXPECT_EQ(96u, orig_pixels[5][2]);
+ EXPECT_NEAR(97u, orig_pixels[5][0], 1);
+ EXPECT_NEAR(97u, orig_pixels[5][1], 1);
+ EXPECT_NEAR(97u, orig_pixels[5][2], 1);
EXPECT_EQ(224u, orig_pixels[5][3]);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
@@ -385,30 +443,30 @@ TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
orig_pixels[5][3] = 224u;
// Do 16 to test asm version.
ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
- EXPECT_EQ(30u, gray_pixels[0][0]);
- EXPECT_EQ(30u, gray_pixels[0][1]);
- EXPECT_EQ(30u, gray_pixels[0][2]);
- EXPECT_EQ(128u, gray_pixels[0][3]);
- EXPECT_EQ(149u, gray_pixels[1][0]);
- EXPECT_EQ(149u, gray_pixels[1][1]);
- EXPECT_EQ(149u, gray_pixels[1][2]);
- EXPECT_EQ(0u, gray_pixels[1][3]);
- EXPECT_EQ(76u, gray_pixels[2][0]);
- EXPECT_EQ(76u, gray_pixels[2][1]);
- EXPECT_EQ(76u, gray_pixels[2][2]);
- EXPECT_EQ(255u, gray_pixels[2][3]);
- EXPECT_EQ(0u, gray_pixels[3][0]);
- EXPECT_EQ(0u, gray_pixels[3][1]);
- EXPECT_EQ(0u, gray_pixels[3][2]);
- EXPECT_EQ(255u, gray_pixels[3][3]);
- EXPECT_EQ(255u, gray_pixels[4][0]);
- EXPECT_EQ(255u, gray_pixels[4][1]);
- EXPECT_EQ(255u, gray_pixels[4][2]);
- EXPECT_EQ(255u, gray_pixels[4][3]);
- EXPECT_EQ(96u, gray_pixels[5][0]);
- EXPECT_EQ(96u, gray_pixels[5][1]);
- EXPECT_EQ(96u, gray_pixels[5][2]);
- EXPECT_EQ(224u, gray_pixels[5][3]);
+ EXPECT_NEAR(30u, gray_pixels[0][0], 1);
+ EXPECT_NEAR(30u, gray_pixels[0][1], 1);
+ EXPECT_NEAR(30u, gray_pixels[0][2], 1);
+ EXPECT_NEAR(128u, gray_pixels[0][3], 1);
+ EXPECT_NEAR(149u, gray_pixels[1][0], 1);
+ EXPECT_NEAR(149u, gray_pixels[1][1], 1);
+ EXPECT_NEAR(149u, gray_pixels[1][2], 1);
+ EXPECT_NEAR(0u, gray_pixels[1][3], 1);
+ EXPECT_NEAR(76u, gray_pixels[2][0], 1);
+ EXPECT_NEAR(76u, gray_pixels[2][1], 1);
+ EXPECT_NEAR(76u, gray_pixels[2][2], 1);
+ EXPECT_NEAR(255u, gray_pixels[2][3], 1);
+ EXPECT_NEAR(0u, gray_pixels[3][0], 1);
+ EXPECT_NEAR(0u, gray_pixels[3][1], 1);
+ EXPECT_NEAR(0u, gray_pixels[3][2], 1);
+ EXPECT_NEAR(255u, gray_pixels[3][3], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][0], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][1], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][2], 1);
+ EXPECT_NEAR(255u, gray_pixels[4][3], 1);
+ EXPECT_NEAR(96u, gray_pixels[5][0], 1);
+ EXPECT_NEAR(96u, gray_pixels[5][1], 1);
+ EXPECT_NEAR(96u, gray_pixels[5][2], 1);
+ EXPECT_NEAR(224u, gray_pixels[5][3], 1);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
@@ -418,6 +476,20 @@ TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1);
}
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i;
+ orig_pixels[i][2] = i;
+ orig_pixels[i][3] = i;
+ }
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(i, orig_pixels[i][0]);
+ EXPECT_EQ(i, orig_pixels[i][1]);
+ EXPECT_EQ(i, orig_pixels[i][2]);
+ EXPECT_EQ(i, orig_pixels[i][3]);
+ }
}
TEST_F(LibYUVPlanarTest, TestARGBSepia) {
@@ -763,27 +835,75 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
}
}
-TEST_F(LibYUVPlanarTest, TestARGBMirror) {
- SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
- SIMD_ALIGNED(uint8_t dst_pixels[1280][4]);
+TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_pixels_opt,
+ benchmark_width_ * benchmark_height_ * 4);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4);
- for (int i = 0; i < 1280; ++i) {
- orig_pixels[i][0] = i;
- orig_pixels[i][1] = i / 2;
- orig_pixels[i][2] = i / 3;
- orig_pixels[i][3] = i / 4;
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+ MaskCpuFlags(disable_cpu_flags_);
+ ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
- ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
- for (int i = 0; i < 1280; ++i) {
- EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
- EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
- EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
- EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
+TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_);
+
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_);
+ MaskCpuFlags(disable_cpu_flags_);
+ MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_,
+ benchmark_width_, benchmark_height_);
}
- for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
- ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) {
+ align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_pixels_opt,
+ benchmark_width_ * benchmark_height_ * 2);
+ align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2);
+
+ MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+ MaskCpuFlags(disable_cpu_flags_);
+ MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
}
+ for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, TestShade) {
@@ -1006,10 +1126,91 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
}
}
+TEST_F(LibYUVPlanarTest, TestInterpolatePlane_16) {
+ SIMD_ALIGNED(uint16_t orig_pixels_0[1280]);
+ SIMD_ALIGNED(uint16_t orig_pixels_1[1280]);
+ SIMD_ALIGNED(uint16_t interpolate_pixels[1280]);
+ memset(orig_pixels_0, 0, sizeof(orig_pixels_0));
+ memset(orig_pixels_1, 0, sizeof(orig_pixels_1));
+
+ orig_pixels_0[0] = 16u;
+ orig_pixels_0[1] = 32u;
+ orig_pixels_0[2] = 64u;
+ orig_pixels_0[3] = 128u;
+ orig_pixels_0[4] = 0u;
+ orig_pixels_0[5] = 0u;
+ orig_pixels_0[6] = 0u;
+ orig_pixels_0[7] = 255u;
+ orig_pixels_0[8] = 0u;
+ orig_pixels_0[9] = 0u;
+ orig_pixels_0[10] = 0u;
+ orig_pixels_0[11] = 0u;
+ orig_pixels_0[12] = 0u;
+ orig_pixels_0[13] = 0u;
+ orig_pixels_0[14] = 0u;
+ orig_pixels_0[15] = 0u;
+
+ orig_pixels_1[0] = 0u;
+ orig_pixels_1[1] = 0u;
+ orig_pixels_1[2] = 0u;
+ orig_pixels_1[3] = 0u;
+ orig_pixels_1[4] = 0u;
+ orig_pixels_1[5] = 0u;
+ orig_pixels_1[6] = 0u;
+ orig_pixels_1[7] = 0u;
+ orig_pixels_1[8] = 0u;
+ orig_pixels_1[9] = 0u;
+ orig_pixels_1[10] = 0u;
+ orig_pixels_1[11] = 0u;
+ orig_pixels_1[12] = 255u;
+ orig_pixels_1[13] = 255u;
+ orig_pixels_1[14] = 255u;
+ orig_pixels_1[15] = 255u;
+
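+ // Fraction 128 blends the two sources 50:50.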
+ InterpolatePlane_16(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+ &interpolate_pixels[0], 0, 16, 1, 128);
+ EXPECT_EQ(8u, interpolate_pixels[0]);
+ EXPECT_EQ(16u, interpolate_pixels[1]);
+ EXPECT_EQ(32u, interpolate_pixels[2]);
+ EXPECT_EQ(64u, interpolate_pixels[3]);
+ EXPECT_EQ(0u, interpolate_pixels[4]);
+ EXPECT_EQ(0u, interpolate_pixels[5]);
+ EXPECT_EQ(0u, interpolate_pixels[6]);
+ EXPECT_EQ(128u, interpolate_pixels[7]);
+ EXPECT_EQ(0u, interpolate_pixels[8]);
+ EXPECT_EQ(0u, interpolate_pixels[9]);
+ EXPECT_EQ(0u, interpolate_pixels[10]);
+ EXPECT_EQ(0u, interpolate_pixels[11]);
+ EXPECT_EQ(128u, interpolate_pixels[12]);
+ EXPECT_EQ(128u, interpolate_pixels[13]);
+ EXPECT_EQ(128u, interpolate_pixels[14]);
+ EXPECT_EQ(128u, interpolate_pixels[15]);
+
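+ // Fraction 0 returns the first source unchanged.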
+ InterpolatePlane_16(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+ &interpolate_pixels[0], 0, 16, 1, 0);
+ EXPECT_EQ(16u, interpolate_pixels[0]);
+ EXPECT_EQ(32u, interpolate_pixels[1]);
+ EXPECT_EQ(64u, interpolate_pixels[2]);
+ EXPECT_EQ(128u, interpolate_pixels[3]);
+
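+ // Fraction 192 weights the second source 3/4 and the first 1/4.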
+ InterpolatePlane_16(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+ &interpolate_pixels[0], 0, 16, 1, 192);
+
+ EXPECT_EQ(4u, interpolate_pixels[0]);
+ EXPECT_EQ(8u, interpolate_pixels[1]);
+ EXPECT_EQ(16u, interpolate_pixels[2]);
+ EXPECT_EQ(32u, interpolate_pixels[3]);
+
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ InterpolatePlane_16(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
+ &interpolate_pixels[0], 0, 1280, 1, 123);
+ }
+}
+
#define TESTTERP(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, STRIDE_B, W1280, TERP, \
N, NEG, OFF) \
TEST_F(LibYUVPlanarTest, ARGBInterpolate##TERP##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kWidth = W1280; \
const int kHeight = benchmark_height_; \
const int kStrideA = \
(kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
@@ -1041,7 +1242,7 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
}
#define TESTINTERPOLATE(TERP) \
- TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ - 1, TERP, _Any, +, 0) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ + 1, TERP, _Any, +, 0) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Unaligned, +, 1) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Invert, -, 0) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Opt, +, 0)
@@ -1058,7 +1259,8 @@ static int TestBlend(int width,
int disable_cpu_flags,
int benchmark_cpu_info,
int invert,
- int off) {
+ int off,
+ int attenuate) {
if (width < 1) {
width = 1;
}
@@ -1072,10 +1274,12 @@ static int TestBlend(int width,
src_argb_a[i + off] = (fastrand() & 0xff);
src_argb_b[i + off] = (fastrand() & 0xff);
}
- ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
- height);
- ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
- height);
+ MemRandomize(src_argb_a, kStride * height + off);
+ MemRandomize(src_argb_b, kStride * height + off);
+ if (attenuate) {
+ ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
+ height);
+ }
memset(dst_argb_c, 255, kStride * height);
memset(dst_argb_opt, 255, kStride * height);
@@ -1104,29 +1308,36 @@ static int TestBlend(int width,
TEST_F(LibYUVPlanarTest, ARGBBlend_Any) {
int max_diff =
- TestBlend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ TestBlend(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Unattenuated) {
+ int max_diff =
+ TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
- disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+ disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
EXPECT_LE(max_diff, 1);
}
@@ -1203,7 +1414,7 @@ TEST_F(LibYUVPlanarTest, BlendPlane_Unaligned) {
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
}
TEST_F(LibYUVPlanarTest, BlendPlane_Any) {
- TestBlendPlane(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
+ TestBlendPlane(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
}
TEST_F(LibYUVPlanarTest, BlendPlane_Invert) {
@@ -1298,7 +1509,7 @@ TEST_F(LibYUVPlanarTest, I420Blend_Unaligned) {
// TODO(fbarchard): DISABLED because _Any uses C. Avoid C and re-enable.
TEST_F(LibYUVPlanarTest, DISABLED_I420Blend_Any) {
- TestI420Blend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
+ TestI420Blend(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
}
TEST_F(LibYUVPlanarTest, I420Blend_Invert) {
@@ -1400,6 +1611,251 @@ TEST_F(LibYUVPlanarTest, TestCopyPlane) {
EXPECT_EQ(0, err);
}
+TEST_F(LibYUVPlanarTest, CopyPlane_Opt) {
+ int i;
+ int y_plane_size = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_y, y_plane_size);
+ align_buffer_page_end(dst_c, y_plane_size);
+ align_buffer_page_end(dst_opt, y_plane_size);
+
+ MemRandomize(orig_y, y_plane_size);
+ memset(dst_c, 1, y_plane_size);
+ memset(dst_opt, 2, y_plane_size);
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags_);
+ for (i = 0; i < benchmark_iterations_; i++) {
+ CopyPlane(orig_y, benchmark_width_, dst_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ }
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (i = 0; i < benchmark_iterations_; i++) {
+ CopyPlane(orig_y, benchmark_width_, dst_opt, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ }
+
+ for (i = 0; i < y_plane_size; ++i) {
+ EXPECT_EQ(dst_c[i], dst_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_y);
+ free_aligned_buffer_page_end(dst_c);
+ free_aligned_buffer_page_end(dst_opt);
+}
+
+TEST_F(LibYUVPlanarTest, TestCopyPlaneZero) {
+ // Test to verify copying a rect with a zero height or width does
+ // not touch destination memory.
+ uint8_t src = 42;
+ uint8_t dst = 0;
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags_);
+ CopyPlane(&src, 0, &dst, 0, 0, 0);
+ EXPECT_EQ(src, 42);
+ EXPECT_EQ(dst, 0);
+
+ CopyPlane(&src, 1, &dst, 1, 1, 0);
+ EXPECT_EQ(src, 42);
+ EXPECT_EQ(dst, 0);
+
+ CopyPlane(&src, 1, &dst, 1, 0, 1);
+ EXPECT_EQ(src, 42);
+ EXPECT_EQ(dst, 0);
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info_);
+ CopyPlane(&src, 0, &dst, 0, 0, 0);
+ EXPECT_EQ(src, 42);
+ EXPECT_EQ(dst, 0);
+
+ CopyPlane(&src, 1, &dst, 1, 1, 0);
+ EXPECT_EQ(src, 42);
+ EXPECT_EQ(dst, 0);
+
+ CopyPlane(&src, 1, &dst, 1, 0, 1);
+ EXPECT_EQ(src, 42);
+ EXPECT_EQ(dst, 0);
+}
+
+TEST_F(LibYUVPlanarTest, TestDetilePlane) {
+ int i, j;
+
+ // orig is tiled. Allocate enough memory for tiles.
+ int tile_width = (benchmark_width_ + 15) & ~15;
+ int tile_height = (benchmark_height_ + 15) & ~15;
+ int tile_plane_size = tile_width * tile_height;
+ int y_plane_size = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(tile_y, tile_plane_size);
+ align_buffer_page_end(dst_c, y_plane_size);
+ align_buffer_page_end(dst_opt, y_plane_size);
+
+ MemRandomize(tile_y, tile_plane_size);
+ memset(dst_c, 0, y_plane_size);
+ memset(dst_opt, 0, y_plane_size);
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags_);
+ for (j = 0; j < benchmark_iterations_; j++) {
+ DetilePlane(tile_y, tile_width, dst_c, benchmark_width_, benchmark_width_,
+ benchmark_height_, 16);
+ }
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (j = 0; j < benchmark_iterations_; j++) {
+ DetilePlane(tile_y, tile_width, dst_opt, benchmark_width_, benchmark_width_,
+ benchmark_height_, 16);
+ }
+
+ for (i = 0; i < y_plane_size; ++i) {
+ EXPECT_EQ(dst_c[i], dst_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(tile_y);
+ free_aligned_buffer_page_end(dst_c);
+ free_aligned_buffer_page_end(dst_opt);
+}
+
+TEST_F(LibYUVPlanarTest, TestDetilePlane_16) {
+ int i, j;
+
+ // orig is tiled. Allocate enough memory for tiles.
+ int tile_width = (benchmark_width_ + 15) & ~15;
+ int tile_height = (benchmark_height_ + 15) & ~15;
+ int tile_plane_size = tile_width * tile_height * 2;
+ int y_plane_size = benchmark_width_ * benchmark_height_ * 2;
+ align_buffer_page_end(tile_y, tile_plane_size);
+ align_buffer_page_end(dst_c, y_plane_size);
+ align_buffer_page_end(dst_opt, y_plane_size);
+
+ MemRandomize(tile_y, tile_plane_size);
+ memset(dst_c, 0, y_plane_size);
+ memset(dst_opt, 0, y_plane_size);
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags_);
+ for (j = 0; j < benchmark_iterations_; j++) {
+ DetilePlane_16((const uint16_t*)tile_y, tile_width, (uint16_t*)dst_c,
+ benchmark_width_, benchmark_width_, benchmark_height_, 16);
+ }
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (j = 0; j < benchmark_iterations_; j++) {
+ DetilePlane_16((const uint16_t*)tile_y, tile_width, (uint16_t*)dst_opt,
+ benchmark_width_, benchmark_width_, benchmark_height_, 16);
+ }
+
+ for (i = 0; i < y_plane_size; ++i) {
+ EXPECT_EQ(dst_c[i], dst_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(tile_y);
+ free_aligned_buffer_page_end(dst_c);
+ free_aligned_buffer_page_end(dst_opt);
+}
+
+// Compares DetileSplitUV to 2 step Detile + SplitUV
+TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) {
+ int i, j;
+
+ // orig is tiled. Allocate enough memory for tiles.
+ int tile_width = (benchmark_width_ + 15) & ~15;
+ int tile_height = (benchmark_height_ + 15) & ~15;
+ int tile_plane_size = tile_width * tile_height;
+ int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_;
+ align_buffer_page_end(tile_uv, tile_plane_size);
+ align_buffer_page_end(detiled_uv, tile_plane_size);
+ align_buffer_page_end(dst_u_two_stage, uv_plane_size);
+ align_buffer_page_end(dst_u_opt, uv_plane_size);
+ align_buffer_page_end(dst_v_two_stage, uv_plane_size);
+ align_buffer_page_end(dst_v_opt, uv_plane_size);
+
+ MemRandomize(tile_uv, tile_plane_size);
+ memset(detiled_uv, 0, tile_plane_size);
+ memset(dst_u_two_stage, 0, uv_plane_size);
+ memset(dst_u_opt, 0, uv_plane_size);
+ memset(dst_v_two_stage, 0, uv_plane_size);
+ memset(dst_v_opt, 0, uv_plane_size);
+
+ DetileSplitUVPlane(tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2,
+ dst_v_opt, (benchmark_width_ + 1) / 2, benchmark_width_,
+ benchmark_height_, 16);
+
+ // Benchmark 2 step conversion for comparison.
+ for (j = 0; j < benchmark_iterations_; j++) {
+ DetilePlane(tile_uv, tile_width, detiled_uv, benchmark_width_,
+ benchmark_width_, benchmark_height_, 16);
+ SplitUVPlane(detiled_uv, tile_width, dst_u_two_stage,
+ (benchmark_width_ + 1) / 2, dst_v_two_stage,
+ (benchmark_width_ + 1) / 2, (benchmark_width_ + 1) / 2,
+ benchmark_height_);
+ }
+
+ for (i = 0; i < uv_plane_size; ++i) {
+ EXPECT_EQ(dst_u_two_stage[i], dst_u_opt[i]);
+ EXPECT_EQ(dst_v_two_stage[i], dst_v_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(tile_uv);
+ free_aligned_buffer_page_end(detiled_uv);
+ free_aligned_buffer_page_end(dst_u_two_stage);
+ free_aligned_buffer_page_end(dst_u_opt);
+ free_aligned_buffer_page_end(dst_v_two_stage);
+ free_aligned_buffer_page_end(dst_v_opt);
+}
+
+TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
+ int i, j;
+
+ // orig is tiled. Allocate enough memory for tiles.
+ int tile_width = (benchmark_width_ + 15) & ~15;
+ int tile_height = (benchmark_height_ + 15) & ~15;
+ int tile_plane_size = tile_width * tile_height;
+ int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_;
+ align_buffer_page_end(tile_uv, tile_plane_size);
+ align_buffer_page_end(dst_u_c, uv_plane_size);
+ align_buffer_page_end(dst_u_opt, uv_plane_size);
+ align_buffer_page_end(dst_v_c, uv_plane_size);
+ align_buffer_page_end(dst_v_opt, uv_plane_size);
+
+ MemRandomize(tile_uv, tile_plane_size);
+ memset(dst_u_c, 0, uv_plane_size);
+ memset(dst_u_opt, 0, uv_plane_size);
+ memset(dst_v_c, 0, uv_plane_size);
+ memset(dst_v_opt, 0, uv_plane_size);
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags_);
+
+ DetileSplitUVPlane(tile_uv, tile_width, dst_u_c, (benchmark_width_ + 1) / 2,
+ dst_v_c, (benchmark_width_ + 1) / 2, benchmark_width_,
+ benchmark_height_, 16);
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (j = 0; j < benchmark_iterations_; j++) {
+ DetileSplitUVPlane(
+ tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt,
+ (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16);
+ }
+
+ for (i = 0; i < uv_plane_size; ++i) {
+ EXPECT_EQ(dst_u_c[i], dst_u_opt[i]);
+ EXPECT_EQ(dst_v_c[i], dst_v_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(tile_uv);
+ free_aligned_buffer_page_end(dst_u_c);
+ free_aligned_buffer_page_end(dst_u_opt);
+ free_aligned_buffer_page_end(dst_v_c);
+ free_aligned_buffer_page_end(dst_v_opt);
+}
+
static int TestMultiply(int width,
int height,
int benchmark_iterations,
@@ -1447,7 +1903,7 @@ static int TestMultiply(int width,
}
TEST_F(LibYUVPlanarTest, ARGBMultiply_Any) {
- int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestMultiply(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
@@ -1522,7 +1978,7 @@ static int TestAdd(int width,
TEST_F(LibYUVPlanarTest, ARGBAdd_Any) {
int max_diff =
- TestAdd(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ TestAdd(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
}
@@ -1595,7 +2051,7 @@ static int TestSubtract(int width,
}
TEST_F(LibYUVPlanarTest, ARGBSubtract_Any) {
- int max_diff = TestSubtract(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestSubtract(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 1);
@@ -1668,7 +2124,7 @@ static int TestSobel(int width,
TEST_F(LibYUVPlanarTest, ARGBSobel_Any) {
int max_diff =
- TestSobel(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ TestSobel(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_EQ(0, max_diff);
}
@@ -1741,7 +2197,7 @@ static int TestSobelToPlane(int width,
}
TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Any) {
- int max_diff = TestSobelToPlane(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestSobelToPlane(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
EXPECT_EQ(0, max_diff);
@@ -1813,7 +2269,7 @@ static int TestSobelXY(int width,
}
TEST_F(LibYUVPlanarTest, ARGBSobelXY_Any) {
- int max_diff = TestSobelXY(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestSobelXY(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
EXPECT_EQ(0, max_diff);
@@ -1889,29 +2345,35 @@ static int TestBlur(int width,
return max_diff;
}
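+// When DISABLE_SLOW_TESTS is set, run the blur tests only on x86; on
+// other architectures mark them DISABLED_.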
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+#define DISABLED_ARM(name) name
+#else
+#define DISABLED_ARM(name) DISABLED_##name
+#endif
+
static const int kBlurSize = 55;
-TEST_F(LibYUVPlanarTest, ARGBBlur_Any) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Any)) {
int max_diff =
- TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ TestBlur(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, ARGBBlur_Unaligned) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Unaligned)) {
int max_diff =
TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, ARGBBlur_Invert) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Invert)) {
int max_diff =
TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Opt)) {
int max_diff =
TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize);
@@ -1919,35 +2381,35 @@ TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) {
}
static const int kBlurSmallSize = 5;
-TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Any) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Any)) {
int max_diff =
- TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+ TestBlur(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Unaligned) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Unaligned)) {
int max_diff =
TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSmallSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Invert) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Invert)) {
int max_diff =
TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSmallSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Opt) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Opt)) {
int max_diff =
TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize);
EXPECT_LE(max_diff, 1);
}
-TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
+TEST_F(LibYUVPlanarTest, DISABLED_ARM(TestARGBPolynomial)) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]);
SIMD_ALIGNED(uint8_t dst_pixels_c[1280][4]);
@@ -2333,12 +2795,23 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
MaskCpuFlags(disable_cpu_flags_);
ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c,
benchmark_width_, benchmark_width_, benchmark_height_);
- MaskCpuFlags(benchmark_cpu_info_);
+ double c_time = get_time();
+ ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+ c_time = (get_time() - c_time);
+ MaskCpuFlags(benchmark_cpu_info_);
+ ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+ double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i) {
ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
benchmark_width_, benchmark_width_, benchmark_height_);
}
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ // Report performance of C vs OPT
+ printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6),
+ static_cast<int>(opt_time * 1e6));
for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
@@ -2361,12 +2834,24 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
MaskCpuFlags(disable_cpu_flags_);
ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
- MaskCpuFlags(benchmark_cpu_info_);
+ double c_time = get_time();
+ ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+ c_time = (get_time() - c_time);
+ MaskCpuFlags(benchmark_cpu_info_);
+ ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+ double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i) {
ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+
+ // Report performance of C vs OPT
+ printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6),
+ static_cast<int>(opt_time * 1e6));
for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
@@ -2426,7 +2911,7 @@ static int TestARGBRect(int width,
}
TEST_F(LibYUVPlanarTest, ARGBRect_Any) {
- int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestARGBRect(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0, 4);
EXPECT_EQ(0, max_diff);
@@ -2454,7 +2939,7 @@ TEST_F(LibYUVPlanarTest, ARGBRect_Opt) {
}
TEST_F(LibYUVPlanarTest, SetPlane_Any) {
- int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_,
+ int max_diff = TestARGBRect(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0, 1);
EXPECT_EQ(0, max_diff);
@@ -2483,33 +2968,24 @@ TEST_F(LibYUVPlanarTest, SetPlane_Opt) {
TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
- align_buffer_page_end(src_pixels, kPixels * 2);
- align_buffer_page_end(tmp_pixels_u, kPixels);
- align_buffer_page_end(tmp_pixels_v, kPixels);
+ align_buffer_page_end(src_pixels_u, kPixels);
+ align_buffer_page_end(src_pixels_v, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_c, kPixels * 2);
- MemRandomize(src_pixels, kPixels * 2);
- MemRandomize(tmp_pixels_u, kPixels);
- MemRandomize(tmp_pixels_v, kPixels);
+ MemRandomize(src_pixels_u, kPixels);
+ MemRandomize(src_pixels_v, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 2);
MemRandomize(dst_pixels_c, kPixels * 2);
MaskCpuFlags(disable_cpu_flags_);
- SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
- tmp_pixels_v, benchmark_width_, benchmark_width_,
- benchmark_height_);
- MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
+ MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_,
dst_pixels_c, benchmark_width_ * 2, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
- SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
- tmp_pixels_v, benchmark_width_, benchmark_width_,
- benchmark_height_);
-
for (int i = 0; i < benchmark_iterations_; ++i) {
- MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
+ MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_,
dst_pixels_opt, benchmark_width_ * 2, benchmark_width_,
benchmark_height_);
}
@@ -2518,9 +2994,43 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
- free_aligned_buffer_page_end(src_pixels);
- free_aligned_buffer_page_end(tmp_pixels_u);
- free_aligned_buffer_page_end(tmp_pixels_v);
+ free_aligned_buffer_page_end(src_pixels_u);
+ free_aligned_buffer_page_end(src_pixels_v);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+// 16 bit channel merge
+TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels_u, kPixels * 2);
+ align_buffer_page_end(src_pixels_v, kPixels * 2);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 2 * 2);
+ align_buffer_page_end(dst_pixels_c, kPixels * 2 * 2);
+ MemRandomize(src_pixels_u, kPixels * 2);
+ MemRandomize(src_pixels_v, kPixels * 2);
+ MemRandomize(dst_pixels_opt, kPixels * 2 * 2);
+ MemRandomize(dst_pixels_c, kPixels * 2 * 2);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_,
+ (const uint16_t*)src_pixels_v, benchmark_width_,
+ (uint16_t*)dst_pixels_c, benchmark_width_ * 2,
+ benchmark_width_, benchmark_height_, 12);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_,
+ (const uint16_t*)src_pixels_v, benchmark_width_,
+ (uint16_t*)dst_pixels_opt, benchmark_width_ * 2,
+ benchmark_width_, benchmark_height_, 12);
+ }
+
+ for (int i = 0; i < kPixels * 2 * 2; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels_u);
+ free_aligned_buffer_page_end(src_pixels_v);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
@@ -2528,47 +3038,112 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 2);
- align_buffer_page_end(tmp_pixels_u, kPixels);
- align_buffer_page_end(tmp_pixels_v, kPixels);
+ align_buffer_page_end(dst_pixels_u_c, kPixels);
+ align_buffer_page_end(dst_pixels_v_c, kPixels);
+ align_buffer_page_end(dst_pixels_u_opt, kPixels);
+ align_buffer_page_end(dst_pixels_v_opt, kPixels);
+
+ MemRandomize(src_pixels, kPixels * 2);
+ MemRandomize(dst_pixels_u_c, kPixels);
+ MemRandomize(dst_pixels_v_c, kPixels);
+ MemRandomize(dst_pixels_u_opt, kPixels);
+ MemRandomize(dst_pixels_v_opt, kPixels);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_c,
+ benchmark_width_, dst_pixels_v_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_opt,
+ benchmark_width_, dst_pixels_v_opt, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
+ EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_u_c);
+ free_aligned_buffer_page_end(dst_pixels_v_c);
+ free_aligned_buffer_page_end(dst_pixels_u_opt);
+ free_aligned_buffer_page_end(dst_pixels_v_opt);
+}
+
+// 16 bit channel split
+TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 2 * 2);
+ align_buffer_page_end(dst_pixels_u_c, kPixels * 2);
+ align_buffer_page_end(dst_pixels_v_c, kPixels * 2);
+ align_buffer_page_end(dst_pixels_u_opt, kPixels * 2);
+ align_buffer_page_end(dst_pixels_v_opt, kPixels * 2);
+ MemRandomize(src_pixels, kPixels * 2 * 2);
+ MemRandomize(dst_pixels_u_c, kPixels * 2);
+ MemRandomize(dst_pixels_v_c, kPixels * 2);
+ MemRandomize(dst_pixels_u_opt, kPixels * 2);
+ MemRandomize(dst_pixels_v_opt, kPixels * 2);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
+ (uint16_t*)dst_pixels_u_c, benchmark_width_,
+ (uint16_t*)dst_pixels_v_c, benchmark_width_, benchmark_width_,
+ benchmark_height_, 10);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
+ (uint16_t*)dst_pixels_u_opt, benchmark_width_,
+ (uint16_t*)dst_pixels_v_opt, benchmark_width_,
+ benchmark_width_, benchmark_height_, 10);
+ }
+
+ for (int i = 0; i < kPixels * 2; ++i) {
+ EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
+ EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
+ }
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_u_c);
+ free_aligned_buffer_page_end(dst_pixels_v_c);
+ free_aligned_buffer_page_end(dst_pixels_u_opt);
+ free_aligned_buffer_page_end(dst_pixels_v_opt);
+}
+
+TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 2);
align_buffer_page_end(dst_pixels_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_c, kPixels * 2);
MemRandomize(src_pixels, kPixels * 2);
- MemRandomize(tmp_pixels_u, kPixels);
- MemRandomize(tmp_pixels_v, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 2);
MemRandomize(dst_pixels_c, kPixels * 2);
MaskCpuFlags(disable_cpu_flags_);
- SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
- tmp_pixels_v, benchmark_width_, benchmark_width_,
- benchmark_height_);
- MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
- dst_pixels_c, benchmark_width_ * 2, benchmark_width_,
- benchmark_height_);
+ SwapUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
- SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u,
- benchmark_width_, tmp_pixels_v, benchmark_width_,
- benchmark_width_, benchmark_height_);
+ SwapUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
+ benchmark_width_ * 2, benchmark_width_, benchmark_height_);
}
- MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
- dst_pixels_opt, benchmark_width_ * 2, benchmark_width_,
- benchmark_height_);
for (int i = 0; i < kPixels * 2; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
- free_aligned_buffer_page_end(tmp_pixels_u);
- free_aligned_buffer_page_end(tmp_pixels_v);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
@@ -2617,6 +3192,7 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
@@ -2663,10 +3239,373 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
+TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 4);
+ align_buffer_page_end(tmp_pixels_r, kPixels);
+ align_buffer_page_end(tmp_pixels_g, kPixels);
+ align_buffer_page_end(tmp_pixels_b, kPixels);
+ align_buffer_page_end(tmp_pixels_a, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 4);
+ align_buffer_page_end(dst_pixels_c, kPixels * 4);
+
+ MemRandomize(src_pixels, kPixels * 4);
+ MemRandomize(tmp_pixels_r, kPixels);
+ MemRandomize(tmp_pixels_g, kPixels);
+ MemRandomize(tmp_pixels_b, kPixels);
+ MemRandomize(tmp_pixels_a, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 4);
+ MemRandomize(dst_pixels_c, kPixels * 4);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, tmp_pixels_a, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_,
+ dst_pixels_c, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, tmp_pixels_a, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g,
+ benchmark_width_, tmp_pixels_b, benchmark_width_,
+ tmp_pixels_a, benchmark_width_, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_r);
+ free_aligned_buffer_page_end(tmp_pixels_g);
+ free_aligned_buffer_page_end(tmp_pixels_b);
+ free_aligned_buffer_page_end(tmp_pixels_a);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 4);
+ align_buffer_page_end(tmp_pixels_r, kPixels);
+ align_buffer_page_end(tmp_pixels_g, kPixels);
+ align_buffer_page_end(tmp_pixels_b, kPixels);
+ align_buffer_page_end(tmp_pixels_a, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 4);
+ align_buffer_page_end(dst_pixels_c, kPixels * 4);
+
+ MemRandomize(src_pixels, kPixels * 4);
+ MemRandomize(tmp_pixels_r, kPixels);
+ MemRandomize(tmp_pixels_g, kPixels);
+ MemRandomize(tmp_pixels_b, kPixels);
+ MemRandomize(tmp_pixels_a, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 4);
+ MemRandomize(dst_pixels_c, kPixels * 4);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, tmp_pixels_a, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_,
+ dst_pixels_c, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, tmp_pixels_a,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+ }
+
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_,
+ dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_r);
+ free_aligned_buffer_page_end(tmp_pixels_g);
+ free_aligned_buffer_page_end(tmp_pixels_b);
+ free_aligned_buffer_page_end(tmp_pixels_a);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 4);
+ align_buffer_page_end(tmp_pixels_r, kPixels);
+ align_buffer_page_end(tmp_pixels_g, kPixels);
+ align_buffer_page_end(tmp_pixels_b, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 4);
+ align_buffer_page_end(dst_pixels_c, kPixels * 4);
+
+ MemRandomize(src_pixels, kPixels * 4);
+ MemRandomize(tmp_pixels_r, kPixels);
+ MemRandomize(tmp_pixels_g, kPixels);
+ MemRandomize(tmp_pixels_b, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 4);
+ MemRandomize(dst_pixels_c, kPixels * 4);
+
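+ // A NULL alpha plane selects the XRGB paths: SplitARGBPlane skips the
+ // alpha channel and MergeARGBPlane writes opaque alpha instead.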
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, NULL, 0, benchmark_width_,
+ benchmark_height_);
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, NULL, 0, benchmark_width_,
+ benchmark_height_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g,
+ benchmark_width_, tmp_pixels_b, benchmark_width_, NULL, 0,
+ dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_r);
+ free_aligned_buffer_page_end(tmp_pixels_g);
+ free_aligned_buffer_page_end(tmp_pixels_b);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 4);
+ align_buffer_page_end(tmp_pixels_r, kPixels);
+ align_buffer_page_end(tmp_pixels_g, kPixels);
+ align_buffer_page_end(tmp_pixels_b, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 4);
+ align_buffer_page_end(dst_pixels_c, kPixels * 4);
+
+ MemRandomize(src_pixels, kPixels * 4);
+ MemRandomize(tmp_pixels_r, kPixels);
+ MemRandomize(tmp_pixels_g, kPixels);
+ MemRandomize(tmp_pixels_b, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 4);
+ MemRandomize(dst_pixels_c, kPixels * 4);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, NULL, 0, benchmark_width_,
+ benchmark_height_);
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_c,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, NULL, 0, benchmark_width_,
+ benchmark_height_);
+ }
+
+ MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_opt,
+ benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_r);
+ free_aligned_buffer_page_end(tmp_pixels_g);
+ free_aligned_buffer_page_end(tmp_pixels_b);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+// Merge 4 channels
+#define TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \
+ const int kWidth = W1280; \
+ const int kPixels = kWidth * benchmark_height_; \
+ align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_a, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
+ align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
+ MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_a, kPixels * sizeof(STYPE) + OFF); \
+ memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \
+ memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \
+ STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
+ STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
+ STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
+ STYPE* src_pixels_a = reinterpret_cast<STYPE*>(src_memory_a + OFF); \
+ DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
+ DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
+ kWidth, src_pixels_a, kWidth, dst_pixels_c, kWidth * 4, \
+ kWidth, NEG benchmark_height_, DEPTH); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
+ kWidth, src_pixels_a, kWidth, dst_pixels_opt, kWidth * 4, \
+ kWidth, NEG benchmark_height_, DEPTH); \
+ } \
+ for (int i = 0; i < kPixels * 4; ++i) { \
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_memory_r); \
+ free_aligned_buffer_page_end(src_memory_g); \
+ free_aligned_buffer_page_end(src_memory_b); \
+ free_aligned_buffer_page_end(src_memory_a); \
+ free_aligned_buffer_page_end(dst_memory_c); \
+ free_aligned_buffer_page_end(dst_memory_opt); \
+ }
+
+// Merge 3 channel RGB into 4 channel XRGB with opaque alpha
+#define TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVPlanarTest, FUNC##Plane_Opaque_##DEPTH##N) { \
+ const int kWidth = W1280; \
+ const int kPixels = kWidth * benchmark_height_; \
+ align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
+ align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
+ MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+ memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \
+ memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \
+ STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
+ STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
+ STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
+ DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
+ DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
+ kWidth, NULL, 0, dst_pixels_c, kWidth * 4, kWidth, \
+ NEG benchmark_height_, DEPTH); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
+ kWidth, NULL, 0, dst_pixels_opt, kWidth * 4, kWidth, \
+ NEG benchmark_height_, DEPTH); \
+ } \
+ for (int i = 0; i < kPixels * 4; ++i) { \
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_memory_r); \
+ free_aligned_buffer_page_end(src_memory_g); \
+ free_aligned_buffer_page_end(src_memory_b); \
+ free_aligned_buffer_page_end(dst_memory_c); \
+ free_aligned_buffer_page_end(dst_memory_opt); \
+ }
+
+#define TESTQPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \
+ TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, 0) \
+ TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
+ 2) \
+ TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \
+ TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) \
+ TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, \
+ 0) \
+ TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
+ 2) \
+ TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \
+ TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0)
+
+TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 10)
+TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 12)
+TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 16)
+TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 10)
+TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 12)
+TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16)
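+// Each TESTQPLANARTOP above expands to eight tests per depth, e.g.
+// MergeAR64Plane_10_Any/_Unaligned/_Invert/_Opt plus the
+// MergeAR64Plane_Opaque_10_* variants that pass a NULL alpha plane.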
+
+#define TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \
+ const int kWidth = W1280; \
+ const int kPixels = kWidth * benchmark_height_; \
+ align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+ align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
+ align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
+ MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+ MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+ STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
+ STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
+ STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
+ DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
+ DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
+ memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \
+ memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
+ kWidth, dst_pixels_c, kWidth * 4, kWidth, \
+ NEG benchmark_height_, DEPTH); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
+ kWidth, dst_pixels_opt, kWidth * 4, kWidth, \
+ NEG benchmark_height_, DEPTH); \
+ } \
+ for (int i = 0; i < kPixels * 4; ++i) { \
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
+ } \
+ free_aligned_buffer_page_end(src_memory_r); \
+ free_aligned_buffer_page_end(src_memory_g); \
+ free_aligned_buffer_page_end(src_memory_b); \
+ free_aligned_buffer_page_end(dst_memory_c); \
+ free_aligned_buffer_page_end(dst_memory_opt); \
+ }
+
+#define TESTTPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \
+ TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, 0) \
+ TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
+ 2) \
+ TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \
+ TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0)
+
+TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 10)
+TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 12)
+TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16)
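+// Likewise each TESTTPLANARTOP expands to four tests per depth, e.g.
+// MergeXR30Plane_10_Any; the three-plane form takes no alpha input.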
+
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MERGEUVROW_16_AVX2
TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 8
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
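+ // (n + 7) & ~7 rounds n up to the next multiple of 8, e.g. 1001 -> 1008.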
+
align_buffer_page_end(src_pixels_u, kPixels * 2);
align_buffer_page_end(src_pixels_v, kPixels * 2);
align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2);
@@ -2679,19 +3618,19 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
MergeUVRow_16_C(reinterpret_cast<const uint16_t*>(src_pixels_u),
reinterpret_cast<const uint16_t*>(src_pixels_v),
- reinterpret_cast<uint16_t*>(dst_pixels_uv_c), 64, kPixels);
+ reinterpret_cast<uint16_t*>(dst_pixels_uv_c), 16, kPixels);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
for (int i = 0; i < benchmark_iterations_; ++i) {
if (has_avx2) {
MergeUVRow_16_AVX2(reinterpret_cast<const uint16_t*>(src_pixels_u),
reinterpret_cast<const uint16_t*>(src_pixels_v),
- reinterpret_cast<uint16_t*>(dst_pixels_uv_opt), 64,
+ reinterpret_cast<uint16_t*>(dst_pixels_uv_opt), 16,
kPixels);
} else {
MergeUVRow_16_C(reinterpret_cast<const uint16_t*>(src_pixels_u),
reinterpret_cast<const uint16_t*>(src_pixels_v),
- reinterpret_cast<uint16_t*>(dst_pixels_uv_opt), 64,
+ reinterpret_cast<uint16_t*>(dst_pixels_uv_opt), 16,
kPixels);
}
}
@@ -2710,7 +3649,9 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_MULTIPLYROW_16_AVX2
TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
- const int kPixels = benchmark_width_ * benchmark_height_;
+ // Round count up to multiple of 32
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
+
align_buffer_page_end(src_pixels_y, kPixels * 2);
align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
@@ -2776,6 +3717,65 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
+TEST_F(LibYUVPlanarTest, YUY2ToY) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels_y, kPixels * 2);
+ align_buffer_page_end(dst_pixels_y_opt, kPixels);
+ align_buffer_page_end(dst_pixels_y_c, kPixels);
+
+ MemRandomize(src_pixels_y, kPixels * 2);
+ memset(dst_pixels_y_opt, 0, kPixels);
+ memset(dst_pixels_y_c, 1, kPixels);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ YUY2ToY(src_pixels_y, benchmark_width_ * 2, dst_pixels_y_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ YUY2ToY(src_pixels_y, benchmark_width_ * 2, dst_pixels_y_opt,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_y);
+ free_aligned_buffer_page_end(dst_pixels_y_opt);
+ free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+
+TEST_F(LibYUVPlanarTest, UYVYToY) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels_y, kPixels * 2);
+ align_buffer_page_end(dst_pixels_y_opt, kPixels);
+ align_buffer_page_end(dst_pixels_y_c, kPixels);
+
+ MemRandomize(src_pixels_y, kPixels * 2);
+ memset(dst_pixels_y_opt, 0, kPixels);
+ memset(dst_pixels_y_c, 1, kPixels);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ UYVYToY(src_pixels_y, benchmark_width_ * 2, dst_pixels_y_c, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ UYVYToY(src_pixels_y, benchmark_width_ * 2, dst_pixels_y_opt,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_y);
+ free_aligned_buffer_page_end(dst_pixels_y_opt);
+ free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+
+#ifdef ENABLE_ROW_TESTS
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_CONVERT16TO8ROW_AVX2
TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
@@ -2822,6 +3822,36 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
}
#endif // HAS_CONVERT16TO8ROW_AVX2
+#ifdef HAS_UYVYTOYROW_NEON
+TEST_F(LibYUVPlanarTest, UYVYToYRow_Opt) {
+ // NEON does multiple of 16, so round count up
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+ align_buffer_page_end(src_pixels_y, kPixels * 2);
+ align_buffer_page_end(dst_pixels_y_opt, kPixels);
+ align_buffer_page_end(dst_pixels_y_c, kPixels);
+
+ MemRandomize(src_pixels_y, kPixels * 2);
+ memset(dst_pixels_y_opt, 0, kPixels);
+ memset(dst_pixels_y_c, 1, kPixels);
+
+ UYVYToYRow_C(src_pixels_y, dst_pixels_y_c, kPixels);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ UYVYToYRow_NEON(src_pixels_y, dst_pixels_y_opt, kPixels);
+ }
+
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_y);
+ free_aligned_buffer_page_end(dst_pixels_y_opt);
+ free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+#endif // HAS_UYVYTOYROW_NEON
+
+#endif // ENABLE_ROW_TESTS
+
TEST_F(LibYUVPlanarTest, Convert8To16Plane) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_y, kPixels);
@@ -2855,6 +3885,7 @@ TEST_F(LibYUVPlanarTest, Convert8To16Plane) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
+#ifdef ENABLE_ROW_TESTS
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_CONVERT8TO16ROW_AVX2
TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) {
@@ -3173,33 +4204,33 @@ extern "C" void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width);
extern "C" void GaussRow_C(const uint32_t* src, uint16_t* dst, int width);
TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
- SIMD_ALIGNED(uint32_t orig_pixels[640 + 4]);
- SIMD_ALIGNED(uint16_t dst_pixels_c[640]);
- SIMD_ALIGNED(uint16_t dst_pixels_opt[640]);
+ SIMD_ALIGNED(uint32_t orig_pixels[1280 + 8]);
+ SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
+ SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
- for (int i = 0; i < 640 + 4; ++i) {
+ for (int i = 0; i < 1280 + 8; ++i) {
orig_pixels[i] = i * 256;
}
- GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 640);
- for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
+ GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
- GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
- GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
#else
- GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
#endif
}
- for (int i = 0; i < 640; ++i) {
+ for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
@@ -3225,141 +4256,139 @@ extern "C" void GaussCol_C(const uint16_t* src0,
int width);
TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
- SIMD_ALIGNED(uint16_t orig_pixels[640 * 5]);
- SIMD_ALIGNED(uint32_t dst_pixels_c[640]);
- SIMD_ALIGNED(uint32_t dst_pixels_opt[640]);
+ SIMD_ALIGNED(uint16_t orig_pixels[1280 * 5]);
+ SIMD_ALIGNED(uint32_t dst_pixels_c[1280]);
+ SIMD_ALIGNED(uint32_t dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
- for (int i = 0; i < 640 * 5; ++i) {
- orig_pixels[i] = i;
+ for (int i = 0; i < 1280 * 5; ++i) {
+ orig_pixels[i] = static_cast<uint16_t>(i);
}
- GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_c[0],
- 640);
- for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
+ GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0],
+ 1280);
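+ // GaussCol weights the five source rows 1:4:6:4:1, the binomial
+ // approximation of a Gaussian kernel.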
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
- GaussCol_NEON(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4],
- &dst_pixels_opt[0], 640);
+ GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
} else {
- GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4],
- &dst_pixels_opt[0], 640);
+ GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
}
#else
- GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
- &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_opt[0],
- 640);
+ GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
#endif
}
- for (int i = 0; i < 640; ++i) {
+ for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
-
- EXPECT_EQ(dst_pixels_c[0],
- static_cast<uint32_t>(0 * 1 + 640 * 4 + 640 * 2 * 6 + 640 * 3 * 4 +
- 640 * 4 * 1));
- EXPECT_EQ(dst_pixels_c[639], static_cast<uint32_t>(30704));
}
-float TestFloatDivToByte(int benchmark_width,
- int benchmark_height,
- int benchmark_iterations,
- float scale,
- bool opt) {
- int i, j;
- // NEON does multiple of 8, so round count up
- const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
- align_buffer_page_end(src_weights, kPixels * 4);
- align_buffer_page_end(src_values, kPixels * 4);
- align_buffer_page_end(dst_out_c, kPixels);
- align_buffer_page_end(dst_out_opt, kPixels);
- align_buffer_page_end(dst_mask_c, kPixels);
- align_buffer_page_end(dst_mask_opt, kPixels);
-
- // Randomize works but may contain some denormals affecting performance.
- // MemRandomize(orig_y, kPixels * 4);
- // large values are problematic. audio is really -1 to 1.
- for (i = 0; i < kPixels; ++i) {
- (reinterpret_cast<float*>(src_weights))[i] = scale;
- (reinterpret_cast<float*>(src_values))[i] =
- sinf(static_cast<float>(i) * 0.1f);
- }
- memset(dst_out_c, 0, kPixels);
- memset(dst_out_opt, 1, kPixels);
- memset(dst_mask_c, 2, kPixels);
- memset(dst_mask_opt, 3, kPixels);
+TEST_F(LibYUVPlanarTest, TestGaussRow_F32_Opt) {
+ SIMD_ALIGNED(float orig_pixels[1280 + 4]);
+ SIMD_ALIGNED(float dst_pixels_c[1280]);
+ SIMD_ALIGNED(float dst_pixels_opt[1280]);
- FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
- reinterpret_cast<float*>(src_values), dst_out_c,
- dst_mask_c, kPixels);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
- for (j = 0; j < benchmark_iterations; j++) {
- if (opt) {
-#ifdef HAS_FLOATDIVTOBYTEROW_NEON
- FloatDivToByteRow_NEON(reinterpret_cast<float*>(src_weights),
- reinterpret_cast<float*>(src_values), dst_out_opt,
- dst_mask_opt, kPixels);
-#else
- FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
- reinterpret_cast<float*>(src_values), dst_out_opt,
- dst_mask_opt, kPixels);
-#endif
+ for (int i = 0; i < 1280 + 4; ++i) {
+ orig_pixels[i] = static_cast<float>(i);
+ }
+ GaussRow_F32_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ GaussRow_F32_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else {
- FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
- reinterpret_cast<float*>(src_values), dst_out_opt,
- dst_mask_opt, kPixels);
+ GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
}
+#else
+ GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
+#endif
}
- uint8_t max_diff = 0;
- for (i = 0; i < kPixels; ++i) {
- uint8_t abs_diff = abs(dst_out_c[i] - dst_out_opt[i]) +
- abs(dst_mask_c[i] - dst_mask_opt[i]);
- if (abs_diff > max_diff) {
- max_diff = abs_diff;
- }
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
+}
- free_aligned_buffer_page_end(src_weights);
- free_aligned_buffer_page_end(src_values);
- free_aligned_buffer_page_end(dst_out_c);
- free_aligned_buffer_page_end(dst_out_opt);
- free_aligned_buffer_page_end(dst_mask_c);
- free_aligned_buffer_page_end(dst_mask_opt);
+TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
+ SIMD_ALIGNED(float dst_pixels_c[1280]);
+ SIMD_ALIGNED(float dst_pixels_opt[1280]);
+ align_buffer_page_end(orig_pixels_buf, 1280 * 5 * 4); // 5 rows of floats
+ float* orig_pixels = reinterpret_cast<float*>(orig_pixels_buf);
- return max_diff;
-}
+ memset(orig_pixels, 0, 1280 * 5 * 4);
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
-TEST_F(LibYUVPlanarTest, TestFloatDivToByte_C) {
- float diff = TestFloatDivToByte(benchmark_width_, benchmark_height_,
- benchmark_iterations_, 1.2f, false);
- EXPECT_EQ(0, diff);
-}
+ for (int i = 0; i < 1280 * 5; ++i) {
+ orig_pixels[i] = static_cast<float>(i);
+ }
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ GaussCol_F32_NEON(&orig_pixels[0], &orig_pixels[1280],
+ &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+ &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
+ } else {
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280],
+ &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+ &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
+ }
+#else
+ GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+ &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+ &dst_pixels_opt[0], 1280);
+#endif
+ }
-TEST_F(LibYUVPlanarTest, TestFloatDivToByte_Opt) {
- float diff = TestFloatDivToByte(benchmark_width_, benchmark_height_,
- benchmark_iterations_, 1.2f, true);
- EXPECT_EQ(0, diff);
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ free_aligned_buffer_page_end(orig_pixels_buf);
}
-TEST_F(LibYUVPlanarTest, UVToVURow) {
+TEST_F(LibYUVPlanarTest, SwapUVRow) {
const int kPixels = benchmark_width_ * benchmark_height_;
+ void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+ SwapUVRow_C;
+
align_buffer_page_end(src_pixels_vu, kPixels * 2);
align_buffer_page_end(dst_pixels_uv, kPixels * 2);
-
MemRandomize(src_pixels_vu, kPixels * 2);
memset(dst_pixels_uv, 1, kPixels * 2);
- UVToVURow_C(src_pixels_vu, dst_pixels_uv, kPixels);
+#if defined(HAS_SWAPUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SwapUVRow = SwapUVRow_Any_NEON;
+ if (IS_ALIGNED(kPixels, 16)) {
+ SwapUVRow = SwapUVRow_NEON;
+ }
+ }
+#endif
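+ // SwapUVRow_NEON needs a multiple of 16 pixels; the _Any wrapper handles
+ // the remainder, so it is chosen when kPixels is not 16-aligned.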
+ for (int j = 0; j < benchmark_iterations_; j++) {
+ SwapUVRow(src_pixels_vu, dst_pixels_uv, kPixels);
+ }
for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]);
EXPECT_EQ(dst_pixels_uv[i * 2 + 1], src_pixels_vu[i * 2 + 0]);
@@ -3368,5 +4397,223 @@ TEST_F(LibYUVPlanarTest, UVToVURow) {
free_aligned_buffer_page_end(src_pixels_vu);
free_aligned_buffer_page_end(dst_pixels_uv);
}
+#endif // ENABLE_ROW_TESTS
+
+TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
+ const int kSize = benchmark_width_ * benchmark_height_ * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ align_buffer_page_end(dst_pixels_opt, kSize);
+ align_buffer_page_end(dst_pixels_c, kSize);
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ ((float*)(orig_pixels))[i] = (i & 1023) * 3.14f;
+ }
+ memset(dst_pixels_opt, 1, kSize);
+ memset(dst_pixels_c, 2, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
+ (float*)(dst_pixels_c), benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
+ (float*)(dst_pixels_opt), benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ }
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
+ << i;
+ }
+
+ free_aligned_buffer_page_end(dst_pixels_c);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
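+ // HalfMergeUVPlane 2x2-averages full-resolution U and V and writes
+ // interleaved UV at half size, hence the rounded-up half dimensions.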
+ int dst_width = (benchmark_width_ + 1) / 2;
+ int dst_height = (benchmark_height_ + 1) / 2;
+ align_buffer_page_end(src_pixels_u, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_pixels_v, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(tmp_pixels_u, dst_width * dst_height);
+ align_buffer_page_end(tmp_pixels_v, dst_width * dst_height);
+ align_buffer_page_end(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+ align_buffer_page_end(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+ MemRandomize(src_pixels_u, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_pixels_v, benchmark_width_ * benchmark_height_);
+ MemRandomize(tmp_pixels_u, dst_width * dst_height);
+ MemRandomize(tmp_pixels_v, dst_width * dst_height);
+ MemRandomize(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+ MemRandomize(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
+ benchmark_width_, dst_pixels_uv_c, dst_width * 2,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
+ benchmark_width_, dst_pixels_uv_opt, dst_width * 2,
+ benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < dst_width * 2 * dst_height; ++i) {
+ EXPECT_EQ(dst_pixels_uv_c[i], dst_pixels_uv_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_u);
+ free_aligned_buffer_page_end(src_pixels_v);
+ free_aligned_buffer_page_end(tmp_pixels_u);
+ free_aligned_buffer_page_end(tmp_pixels_v);
+ free_aligned_buffer_page_end(dst_pixels_uv_opt);
+ free_aligned_buffer_page_end(dst_pixels_uv_c);
+}
+
+TEST_F(LibYUVPlanarTest, NV12Copy) {
+ const int halfwidth = (benchmark_width_ + 1) >> 1;
+ const int halfheight = (benchmark_height_ + 1) >> 1;
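+ // NV12 chroma is one interleaved UV plane at half resolution, so its row
+ // stride is halfwidth * 2 bytes.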
+ align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_uv, halfwidth * 2 * halfheight);
+ align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_uv, halfwidth * 2 * halfheight);
+
+ MemRandomize(src_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_uv, halfwidth * 2 * halfheight);
+ MemRandomize(dst_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(dst_uv, halfwidth * 2 * halfheight);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ NV12Copy(src_y, benchmark_width_, src_uv, halfwidth * 2, dst_y,
+ benchmark_width_, dst_uv, halfwidth * 2, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(src_y[i], dst_y[i]);
+ }
+ for (int i = 0; i < halfwidth * 2 * halfheight; ++i) {
+ EXPECT_EQ(src_uv[i], dst_uv[i]);
+ }
+
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_uv);
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVPlanarTest, NV21Copy) {
+ const int halfwidth = (benchmark_width_ + 1) >> 1;
+ const int halfheight = (benchmark_height_ + 1) >> 1;
+ align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_vu, halfwidth * 2 * halfheight);
+ align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(dst_vu, halfwidth * 2 * halfheight);
+
+ MemRandomize(src_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_vu, halfwidth * 2 * halfheight);
+ MemRandomize(dst_y, benchmark_width_ * benchmark_height_);
+ MemRandomize(dst_vu, halfwidth * 2 * halfheight);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ NV21Copy(src_y, benchmark_width_, src_vu, halfwidth * 2, dst_y,
+ benchmark_width_, dst_vu, halfwidth * 2, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ EXPECT_EQ(src_y[i], dst_y[i]);
+ }
+ for (int i = 0; i < halfwidth * 2 * halfheight; ++i) {
+ EXPECT_EQ(src_vu[i], dst_vu[i]);
+ }
+
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_vu);
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
+#if defined(ENABLE_ROW_TESTS) && !defined(LIBYUV_DISABLE_NEON) && \
+ defined(__aarch64__)
+
+TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32) {
+ int i, j;
+ const int y_plane_size = benchmark_width_ * benchmark_height_;
+
+ align_buffer_page_end(orig_f, y_plane_size * 4);
+ align_buffer_page_end(orig_y, y_plane_size * 2);
+ align_buffer_page_end(dst_opt, y_plane_size * 4);
+ align_buffer_page_end(rec_opt, y_plane_size * 2);
+
+ for (i = 0; i < y_plane_size; ++i) {
+ ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f;
+ }
+ memset(orig_y, 1, y_plane_size * 2);
+ memset(dst_opt, 2, y_plane_size * 4);
+ memset(rec_opt, 3, y_plane_size * 2);
+
+ ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y,
+ y_plane_size);
+
+ for (j = 0; j < benchmark_iterations_; j++) {
+ ConvertFP16ToFP32Row_NEON((const uint16_t*)orig_y, (float*)dst_opt,
+ y_plane_size);
+ }
+
+ ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt,
+ y_plane_size);
+
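+ // FP16 -> FP32 is exact (every half value is representable as a float),
+ // so converting dst_opt back to FP16 must match orig_y bit for bit.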
+ for (i = 0; i < y_plane_size; ++i) {
+ EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_f);
+ free_aligned_buffer_page_end(orig_y);
+ free_aligned_buffer_page_end(dst_opt);
+ free_aligned_buffer_page_end(rec_opt);
+}
+
+TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32Column) {
+ int i, j;
+ const int y_plane_size = benchmark_width_ * benchmark_height_;
+
+ align_buffer_page_end(orig_f, y_plane_size * 4);
+ align_buffer_page_end(orig_y, y_plane_size * 2);
+ align_buffer_page_end(dst_opt, y_plane_size * 4);
+ align_buffer_page_end(rec_opt, y_plane_size * 2);
+
+ for (i = 0; i < y_plane_size; ++i) {
+ ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f;
+ }
+ memset(orig_y, 1, y_plane_size * 2);
+ memset(dst_opt, 2, y_plane_size * 4);
+ memset(rec_opt, 3, y_plane_size * 2);
+
+ ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y,
+ y_plane_size);
+
+ for (j = 0; j < benchmark_iterations_; j++) {
+ ConvertFP16ToFP32Column_NEON((const uint16_t*)orig_y, 1, (float*)dst_opt,
+ y_plane_size);
+ }
+
+ ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt,
+ y_plane_size);
+
+ for (i = 0; i < y_plane_size; ++i) {
+ EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_f);
+ free_aligned_buffer_page_end(orig_y);
+ free_aligned_buffer_page_end(dst_opt);
+ free_aligned_buffer_page_end(rec_opt);
+}
+
+#endif // defined(ENABLE_ROW_TESTS) && !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
} // namespace libyuv
diff --git a/files/unit_test/rotate_argb_test.cc b/unit_test/rotate_argb_test.cc
index d2003895..74952c4e 100644
--- a/files/unit_test/rotate_argb_test.cc
+++ b/unit_test/rotate_argb_test.cc
@@ -156,31 +156,179 @@ TEST_F(LibYUVRotateTest, RotatePlane270_Opt) {
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) {
- TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+ TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate0,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) {
- TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+ TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate90,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) {
- TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+ TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate180,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
- TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
- benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+ TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_);
}
+TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) {
+ int argb_plane_size = benchmark_width_ * 4 * abs(benchmark_height_);
+
+ align_buffer_page_end(src_argb, argb_plane_size);
+ align_buffer_page_end(dst_argb, argb_plane_size);
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_, kRotate0));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ benchmark_width_ * 4 - 1, benchmark_width_ - 1,
+ benchmark_height_, kRotate0));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ benchmark_width_ * 4, benchmark_width_,
+ benchmark_height_, kRotate180));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ benchmark_width_ * 4 - 1, benchmark_width_ - 1,
+ benchmark_height_, kRotate180));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_,
+ benchmark_height_, kRotate90));
+
+ EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_ - 1,
+ benchmark_height_, kRotate90));
+
+ EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_,
+ benchmark_height_, kRotate270));
+
+ EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+ abs(benchmark_height_) * 4, benchmark_width_ - 1,
+ benchmark_height_, kRotate270));
+
+ free_aligned_buffer_page_end(dst_argb);
+ free_aligned_buffer_page_end(src_argb);
+}
+
+static void TestRotatePlane_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height < 1) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_stride = src_width;
+ int src_plane_size = src_stride * abs(src_height);
+ align_buffer_page_end_16(src, src_plane_size);
+ for (int i = 0; i < src_plane_size; ++i) {
+ src[i] = fastrand() & 0xff;
+ }
+
+ int dst_stride = dst_width;
+ int dst_plane_size = dst_stride * dst_height;
+ align_buffer_page_end_16(dst_c, dst_plane_size);
+ align_buffer_page_end_16(dst_opt, dst_plane_size);
+ memset(dst_c, 2, dst_plane_size);
+ memset(dst_opt, 3, dst_plane_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ RotatePlane_16(src, src_stride, dst_c, dst_stride, src_width, src_height,
+ mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ RotatePlane_16(src, src_stride, dst_opt, dst_stride, src_width, src_height,
+ mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_plane_size; ++i) {
+ EXPECT_EQ(dst_c[i], dst_opt[i]);
+ }
+
+ free_aligned_buffer_page_end_16(dst_c);
+ free_aligned_buffer_page_end_16(dst_opt);
+ free_aligned_buffer_page_end_16(src);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane0_16_Opt) {
+ TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane90_16_Opt) {
+ TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane180_16_Opt) {
+ TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane270_16_Opt) {
+ TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane0_16_Odd) {
+ TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane90_16_Odd) {
+ TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane180_16_Odd) {
+ TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane270_16_Odd) {
+ TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
} // namespace libyuv
diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc
new file mode 100644
index 00000000..abc08efa
--- /dev/null
+++ b/unit_test/rotate_test.cc
@@ -0,0 +1,962 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/rotate.h"
+
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/rotate_row.h"
+#endif
+
+namespace libyuv {
+
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
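+// e.g. SUBSAMPLE(5, 2) == 3: chroma dimensions round up for odd luma sizes.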
+
+static void I420TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i420_y_size = src_width * Abs(src_height);
+ int src_i420_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2);
+ int src_i420_size = src_i420_y_size + src_i420_uv_size * 2;
+ align_buffer_page_end(src_i420, src_i420_size);
+ for (int i = 0; i < src_i420_size; ++i) {
+ src_i420[i] = fastrand() & 0xff;
+ }
+
+ int dst_i420_y_size = dst_width * dst_height;
+ int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+ int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
+ align_buffer_page_end(dst_i420_c, dst_i420_size);
+ align_buffer_page_end(dst_i420_opt, dst_i420_size);
+ memset(dst_i420_c, 2, dst_i420_size);
+ memset(dst_i420_opt, 3, dst_i420_size);
+
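+  // Pattern used throughout this file: generate a C reference with SIMD
+  // masked off, rerun with optimizations under benchmark iterations, then
+  // require a byte-exact match since rotation is lossless.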
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I420Rotate(src_i420, src_width, src_i420 + src_i420_y_size,
+ (src_width + 1) / 2, src_i420 + src_i420_y_size + src_i420_uv_size,
+ (src_width + 1) / 2, dst_i420_c, dst_width,
+ dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I420Rotate(
+ src_i420, src_width, src_i420 + src_i420_y_size, (src_width + 1) / 2,
+ src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
+ dst_i420_opt, dst_width, dst_i420_opt + dst_i420_y_size,
+ (dst_width + 1) / 2, dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i420_size; ++i) {
+ EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i420_c);
+ free_aligned_buffer_page_end(dst_i420_opt);
+ free_aligned_buffer_page_end(src_i420);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate0_Opt) {
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate90_Opt) {
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate180_Opt) {
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
+ I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+// TODO(fbarchard): Remove odd width tests.
+// Odd width tests work but are disabled because they use C code, which can
+// instead be exercised by passing an odd width on the command line or via an
+// environment variable.
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) {
+ I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) {
+ I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) {
+ I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
+ I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+static void I422TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i422_y_size = src_width * Abs(src_height);
+ int src_i422_uv_size = ((src_width + 1) / 2) * Abs(src_height);
+ int src_i422_size = src_i422_y_size + src_i422_uv_size * 2;
+ align_buffer_page_end(src_i422, src_i422_size);
+ for (int i = 0; i < src_i422_size; ++i) {
+ src_i422[i] = fastrand() & 0xff;
+ }
+
+ int dst_i422_y_size = dst_width * dst_height;
+ int dst_i422_uv_size = ((dst_width + 1) / 2) * dst_height;
+ int dst_i422_size = dst_i422_y_size + dst_i422_uv_size * 2;
+ align_buffer_page_end(dst_i422_c, dst_i422_size);
+ align_buffer_page_end(dst_i422_opt, dst_i422_size);
+ memset(dst_i422_c, 2, dst_i422_size);
+ memset(dst_i422_opt, 3, dst_i422_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I422Rotate(src_i422, src_width, src_i422 + src_i422_y_size,
+ (src_width + 1) / 2, src_i422 + src_i422_y_size + src_i422_uv_size,
+ (src_width + 1) / 2, dst_i422_c, dst_width,
+ dst_i422_c + dst_i422_y_size, (dst_width + 1) / 2,
+ dst_i422_c + dst_i422_y_size + dst_i422_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I422Rotate(
+ src_i422, src_width, src_i422 + src_i422_y_size, (src_width + 1) / 2,
+ src_i422 + src_i422_y_size + src_i422_uv_size, (src_width + 1) / 2,
+ dst_i422_opt, dst_width, dst_i422_opt + dst_i422_y_size,
+ (dst_width + 1) / 2, dst_i422_opt + dst_i422_y_size + dst_i422_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i422_size; ++i) {
+ EXPECT_EQ(dst_i422_c[i], dst_i422_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i422_c);
+ free_aligned_buffer_page_end(dst_i422_opt);
+ free_aligned_buffer_page_end(src_i422);
+}
+
+TEST_F(LibYUVRotateTest, I422Rotate0_Opt) {
+ I422TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I422Rotate90_Opt) {
+ I422TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I422Rotate180_Opt) {
+ I422TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I422Rotate270_Opt) {
+ I422TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+static void I444TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i444_y_size = src_width * Abs(src_height);
+ int src_i444_uv_size = src_width * Abs(src_height);
+ int src_i444_size = src_i444_y_size + src_i444_uv_size * 2;
+ align_buffer_page_end(src_i444, src_i444_size);
+ for (int i = 0; i < src_i444_size; ++i) {
+ src_i444[i] = fastrand() & 0xff;
+ }
+
+ int dst_i444_y_size = dst_width * dst_height;
+ int dst_i444_uv_size = dst_width * dst_height;
+ int dst_i444_size = dst_i444_y_size + dst_i444_uv_size * 2;
+ align_buffer_page_end(dst_i444_c, dst_i444_size);
+ align_buffer_page_end(dst_i444_opt, dst_i444_size);
+ memset(dst_i444_c, 2, dst_i444_size);
+ memset(dst_i444_opt, 3, dst_i444_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
+ src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
+ dst_i444_c, dst_width, dst_i444_c + dst_i444_y_size, dst_width,
+ dst_i444_c + dst_i444_y_size + dst_i444_uv_size, dst_width,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
+ src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
+ dst_i444_opt, dst_width, dst_i444_opt + dst_i444_y_size,
+ dst_width, dst_i444_opt + dst_i444_y_size + dst_i444_uv_size,
+ dst_width, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i444_size; ++i) {
+ EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i444_c);
+ free_aligned_buffer_page_end(dst_i444_opt);
+ free_aligned_buffer_page_end(src_i444);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate0_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate90_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate180_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate270_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+// TODO(fbarchard): Remove odd width tests.
+// Odd width tests work but are disabled because they use C code, which can
+// instead be exercised by passing an odd width on the command line or via an
+// environment variable.
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) {
+ I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) {
+ I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) {
+ I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) {
+ I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+static void NV12TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) { // allow negative for inversion test.
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_nv12_y_size = src_width * Abs(src_height);
+ int src_nv12_uv_size =
+ ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2) * 2;
+ int src_nv12_size = src_nv12_y_size + src_nv12_uv_size;
+ align_buffer_page_end(src_nv12, src_nv12_size);
+ for (int i = 0; i < src_nv12_size; ++i) {
+ src_nv12[i] = fastrand() & 0xff;
+ }
+
+ int dst_i420_y_size = dst_width * dst_height;
+ int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+ int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
+ align_buffer_page_end(dst_i420_c, dst_i420_size);
+ align_buffer_page_end(dst_i420_opt, dst_i420_size);
+ memset(dst_i420_c, 2, dst_i420_size);
+ memset(dst_i420_opt, 3, dst_i420_size);
+
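+  // The NV12 source stores U and V interleaved in a single plane, so its UV
+  // stride is the width rounded up to even ((src_width + 1) & ~1) rather
+  // than the half width used by the planar formats above.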
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
+ (src_width + 1) & ~1, dst_i420_c, dst_width,
+ dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size,
+ (src_width + 1) & ~1, dst_i420_opt, dst_width,
+ dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i420_size; ++i) {
+ EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i420_c);
+ free_aligned_buffer_page_end(dst_i420_opt);
+ free_aligned_buffer_page_end(src_nv12);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) {
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) {
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) {
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) {
+ NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) {
+ NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) {
+ NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) {
+ NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_width_ + 1, benchmark_height_ + 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) {
+ NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1,
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) {
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) {
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) {
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
+ NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+// Test Android 420 to I420 Rotate
+#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, N, NEG, OFF, PN, OFF_U, OFF_V, ROT) \
+ TEST_F(LibYUVRotateTest, \
+ SRC_FMT_PLANAR##To##FMT_PLANAR##Rotate##ROT##To##PN##N) { \
+ const int kWidth = W1280; \
+ const int kHeight = benchmark_height_; \
+ const int kSizeUV = \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, \
+ kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ uint8_t* src_u = src_uv + OFF_U; \
+ uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
+ int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
+ (fastrand() & 0xff); \
+ src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_u_c, 2, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 3, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_u_opt, 102, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 103, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR##Rotate( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \
+ kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight, \
+ (libyuv::RotationMode)ROT); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR##Rotate( \
+ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \
+ dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight, \
+ (libyuv::RotationMode)ROT); \
+ } \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
+ } \
+ } \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
+ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
+ } \
+ } \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_u_c); \
+ free_aligned_buffer_page_end(dst_v_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_u_opt); \
+ free_aligned_buffer_page_end(dst_v_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
+#define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \
+ SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \
+ SUBSAMP_Y) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ + 1, \
+ _Any, +, 0, PN, OFF_U, OFF_V, 0) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \
+ _Unaligned, +, 2, PN, OFF_U, OFF_V, 0) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \
+ -, 0, PN, OFF_U, OFF_V, 0) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \
+ 0, PN, OFF_U, OFF_V, 0) \
+ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \
+ 0, PN, OFF_U, OFF_V, 180)
+
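+// Android420 uses pixel_stride 1 for planar I420, or pixel_stride 2 for
+// interleaved chroma: NV12 (V at byte offset 1) and NV21 (U at byte offset 1).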
+TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2)
+TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
+TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
+#undef TESTAPLANARTOP
+#undef TESTAPLANARTOPI
+
+static void I010TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i010_y_size = src_width * Abs(src_height);
+ int src_i010_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2);
+ int src_i010_size = src_i010_y_size + src_i010_uv_size * 2;
+ align_buffer_page_end_16(src_i010, src_i010_size);
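+  // I010 carries 10 bit samples in 16 bit words; mask the random data to 10
+  // bits.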
+ for (int i = 0; i < src_i010_size; ++i) {
+ src_i010[i] = fastrand() & 0x3ff;
+ }
+
+ int dst_i010_y_size = dst_width * dst_height;
+ int dst_i010_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+ int dst_i010_size = dst_i010_y_size + dst_i010_uv_size * 2;
+ align_buffer_page_end_16(dst_i010_c, dst_i010_size);
+ align_buffer_page_end_16(dst_i010_opt, dst_i010_size);
+ memset(dst_i010_c, 2, dst_i010_size * 2);
+ memset(dst_i010_opt, 3, dst_i010_size * 2);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I010Rotate(src_i010, src_width, src_i010 + src_i010_y_size,
+ (src_width + 1) / 2, src_i010 + src_i010_y_size + src_i010_uv_size,
+ (src_width + 1) / 2, dst_i010_c, dst_width,
+ dst_i010_c + dst_i010_y_size, (dst_width + 1) / 2,
+ dst_i010_c + dst_i010_y_size + dst_i010_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I010Rotate(
+ src_i010, src_width, src_i010 + src_i010_y_size, (src_width + 1) / 2,
+ src_i010 + src_i010_y_size + src_i010_uv_size, (src_width + 1) / 2,
+ dst_i010_opt, dst_width, dst_i010_opt + dst_i010_y_size,
+ (dst_width + 1) / 2, dst_i010_opt + dst_i010_y_size + dst_i010_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i010_size; ++i) {
+ EXPECT_EQ(dst_i010_c[i], dst_i010_opt[i]);
+ }
+
+ free_aligned_buffer_page_end_16(dst_i010_c);
+ free_aligned_buffer_page_end_16(dst_i010_opt);
+ free_aligned_buffer_page_end_16(src_i010);
+}
+
+TEST_F(LibYUVRotateTest, I010Rotate0_Opt) {
+ I010TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I010Rotate90_Opt) {
+ I010TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I010Rotate180_Opt) {
+ I010TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I010Rotate270_Opt) {
+ I010TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+static void I210TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i210_y_size = src_width * Abs(src_height);
+ int src_i210_uv_size = ((src_width + 1) / 2) * Abs(src_height);
+ int src_i210_size = src_i210_y_size + src_i210_uv_size * 2;
+ align_buffer_page_end_16(src_i210, src_i210_size);
+ for (int i = 0; i < src_i210_size; ++i) {
+ src_i210[i] = fastrand() & 0x3ff;
+ }
+
+ int dst_i210_y_size = dst_width * dst_height;
+ int dst_i210_uv_size = ((dst_width + 1) / 2) * dst_height;
+ int dst_i210_size = dst_i210_y_size + dst_i210_uv_size * 2;
+ align_buffer_page_end_16(dst_i210_c, dst_i210_size);
+ align_buffer_page_end_16(dst_i210_opt, dst_i210_size);
+ memset(dst_i210_c, 2, dst_i210_size * 2);
+ memset(dst_i210_opt, 3, dst_i210_size * 2);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I210Rotate(src_i210, src_width, src_i210 + src_i210_y_size,
+ (src_width + 1) / 2, src_i210 + src_i210_y_size + src_i210_uv_size,
+ (src_width + 1) / 2, dst_i210_c, dst_width,
+ dst_i210_c + dst_i210_y_size, (dst_width + 1) / 2,
+ dst_i210_c + dst_i210_y_size + dst_i210_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I210Rotate(
+ src_i210, src_width, src_i210 + src_i210_y_size, (src_width + 1) / 2,
+ src_i210 + src_i210_y_size + src_i210_uv_size, (src_width + 1) / 2,
+ dst_i210_opt, dst_width, dst_i210_opt + dst_i210_y_size,
+ (dst_width + 1) / 2, dst_i210_opt + dst_i210_y_size + dst_i210_uv_size,
+ (dst_width + 1) / 2, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i210_size; ++i) {
+ EXPECT_EQ(dst_i210_c[i], dst_i210_opt[i]);
+ }
+
+ free_aligned_buffer_page_end_16(dst_i210_c);
+ free_aligned_buffer_page_end_16(dst_i210_opt);
+ free_aligned_buffer_page_end_16(src_i210);
+}
+
+TEST_F(LibYUVRotateTest, I210Rotate0_Opt) {
+ I210TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I210Rotate90_Opt) {
+ I210TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I210Rotate180_Opt) {
+ I210TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I210Rotate270_Opt) {
+ I210TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+static void I410TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i410_y_size = src_width * Abs(src_height);
+ int src_i410_uv_size = src_width * Abs(src_height);
+ int src_i410_size = src_i410_y_size + src_i410_uv_size * 2;
+ align_buffer_page_end_16(src_i410, src_i410_size);
+ for (int i = 0; i < src_i410_size; ++i) {
+ src_i410[i] = fastrand() & 0x3ff;
+ }
+
+ int dst_i410_y_size = dst_width * dst_height;
+ int dst_i410_uv_size = dst_width * dst_height;
+ int dst_i410_size = dst_i410_y_size + dst_i410_uv_size * 2;
+ align_buffer_page_end_16(dst_i410_c, dst_i410_size);
+ align_buffer_page_end_16(dst_i410_opt, dst_i410_size);
+ memset(dst_i410_c, 2, dst_i410_size * 2);
+ memset(dst_i410_opt, 3, dst_i410_size * 2);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I410Rotate(src_i410, src_width, src_i410 + src_i410_y_size, src_width,
+ src_i410 + src_i410_y_size + src_i410_uv_size, src_width,
+ dst_i410_c, dst_width, dst_i410_c + dst_i410_y_size, dst_width,
+ dst_i410_c + dst_i410_y_size + dst_i410_uv_size, dst_width,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I410Rotate(src_i410, src_width, src_i410 + src_i410_y_size, src_width,
+ src_i410 + src_i410_y_size + src_i410_uv_size, src_width,
+ dst_i410_opt, dst_width, dst_i410_opt + dst_i410_y_size,
+ dst_width, dst_i410_opt + dst_i410_y_size + dst_i410_uv_size,
+ dst_width, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i410_size; ++i) {
+ EXPECT_EQ(dst_i410_c[i], dst_i410_opt[i]);
+ }
+
+ free_aligned_buffer_page_end_16(dst_i410_c);
+ free_aligned_buffer_page_end_16(dst_i410_opt);
+ free_aligned_buffer_page_end_16(src_i410);
+}
+
+TEST_F(LibYUVRotateTest, I410Rotate0_Opt) {
+ I410TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I410Rotate90_Opt) {
+ I410TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I410Rotate180_Opt) {
+ I410TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I410Rotate270_Opt) {
+ I410TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+#if defined(ENABLE_ROW_TESTS)
+
+TEST_F(LibYUVRotateTest, Transpose4x4_Test) {
+ // dst width and height
+ const int width = 4;
+ const int height = 4;
+ int src_pixels[4][4];
+ int dst_pixels_c[4][4];
+ int dst_pixels_opt[4][4];
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ src_pixels[i][j] = i * 10 + j;
+ }
+ }
+ memset(dst_pixels_c, 1, width * height * 4);
+ memset(dst_pixels_opt, 2, width * height * 4);
+
+ Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_c, width * 4, width);
+
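+  // Normalize the loop count so the total work matches benchmark_iterations_
+  // over benchmark_width_ x benchmark_height_ pixels, 16 pixels (one 4x4
+  // tile) per call, rounding up.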
+ const int benchmark_iterations =
+ (benchmark_iterations_ * benchmark_width_ * benchmark_height_ + 15) /
+ (4 * 4);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+#if defined(HAS_TRANSPOSE4X4_32_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ } else
+#elif defined(HAS_TRANSPOSE4X4_32_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ } else
+#endif
+ {
+ Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ }
+ }
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ EXPECT_EQ(dst_pixels_c[i][j], src_pixels[j][i]);
+ EXPECT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]);
+ }
+ }
+}
+
+TEST_F(LibYUVRotateTest, Transpose4x4_Opt) {
+ // dst width and height
+ const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3;
+ const int height = 4;
+ align_buffer_page_end(src_pixels, height * width * 4);
+ align_buffer_page_end(dst_pixels_c, width * height * 4);
+ align_buffer_page_end(dst_pixels_opt, width * height * 4);
+
+ MemRandomize(src_pixels, height * width * 4);
+ memset(dst_pixels_c, 1, width * height * 4);
+ memset(dst_pixels_opt, 2, width * height * 4);
+
+ Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_c, width * 4, width);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+#if defined(HAS_TRANSPOSE4X4_32_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ } else
+#elif defined(HAS_TRANSPOSE4X4_32_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Transpose4x4_32_AVX2((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ } else if (TestCpuFlag(kCpuHasSSE2)) {
+ Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ } else
+#endif
+ {
+ Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+ (uint8_t*)dst_pixels_opt, width * 4, width);
+ }
+ }
+
+ for (int i = 0; i < width * height; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(dst_pixels_c);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+}
+
+#endif // ENABLE_ROW_TESTS
+
+} // namespace libyuv
diff --git a/files/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 94aef60e..f54a68f1 100644
--- a/files/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -22,6 +22,12 @@ namespace libyuv {
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that run unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int ARGBTestFilter(int src_width,
int src_height,
@@ -114,8 +120,8 @@ static int ARGBTestFilter(int src_width,
return max_diff;
}
-static const int kTileX = 8;
-static const int kTileY = 8;
+static const int kTileX = 64;
+static const int kTileY = 64;
static int TileARGBScale(const uint8_t* src_argb,
int src_stride_argb,
@@ -232,7 +238,7 @@ static int ARGBClipTestFilter(int src_width,
#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \
TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \
int diff = ARGBTestFilter( \
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
@@ -241,7 +247,7 @@ static int ARGBClipTestFilter(int src_width,
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \
+ TEST_F(LibYUVScaleTest, DISABLED_##ARGBScaleDownClipBy##name##_##filter) { \
int diff = ARGBClipTestFilter( \
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
@@ -251,15 +257,30 @@ static int ARGBClipTestFilter(int src_width,
// Test a scale factor with all 4 filters. Expect unfiltered to be exact; the
// filtered results may differ because SSSE3, NEON and C use different
// fixed-point implementations.
-#define TEST_FACTOR(name, nom, denom) \
- TEST_FACTOR1(name, None, nom, denom, 0) \
- TEST_FACTOR1(name, Linear, nom, denom, 3) \
- TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
- TEST_FACTOR1(name, Box, nom, denom, 3)
+#ifndef DISABLE_SLOW_TESTS
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(, name, None, nom, denom, 0) \
+ TEST_FACTOR1(, name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(, name, Box, nom, denom, 3)
+#else
+#if defined(ENABLE_FULL_TESTS)
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \
+ TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Box, nom, denom, 3)
+#else
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3)
+#endif
+#endif
TEST_FACTOR(2, 1, 2)
TEST_FACTOR(4, 1, 4)
+#ifndef DISABLE_SLOW_TESTS
TEST_FACTOR(8, 1, 8)
+#endif
TEST_FACTOR(3by4, 3, 4)
TEST_FACTOR(3by8, 3, 8)
TEST_FACTOR(3, 1, 3)
@@ -268,7 +289,7 @@ TEST_FACTOR(3, 1, 3)
#undef SX
#undef DX
-#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, width, \
height, kFilter##filter, benchmark_iterations_, \
@@ -282,34 +303,70 @@ TEST_FACTOR(3, 1, 3)
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) { \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##name##ClipTo##width##x##height##_##filter) { \
int diff = \
ARGBClipTestFilter(benchmark_width_, benchmark_height_, width, height, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##name##ClipFrom##width##x##height##_##filter) { \
int diff = ARGBClipTestFilter(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
-/// Test scale to a specified size with all 4 filters.
-#define TEST_SCALETO(name, width, height) \
- TEST_SCALETO1(name, width, height, None, 0) \
- TEST_SCALETO1(name, width, height, Linear, 3) \
- TEST_SCALETO1(name, width, height, Bilinear, 3)
+#ifndef DISABLE_SLOW_TESTS
+// Test scale to a specified size with the None, Linear and Bilinear filters.
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(, name, width, height, None, 0) \
+ TEST_SCALETO1(, name, width, height, Linear, 3) \
+ TEST_SCALETO1(, name, width, height, Bilinear, 3)
+#else
+#if defined(ENABLE_FULL_TESTS)
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3)
+#else
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3)
+#endif
+#endif
TEST_SCALETO(ARGBScale, 1, 1)
-TEST_SCALETO(ARGBScale, 320, 240)
TEST_SCALETO(ARGBScale, 569, 480)
TEST_SCALETO(ARGBScale, 640, 360)
+#ifndef DISABLE_SLOW_TESTS
+TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */
+TEST_SCALETO(ARGBScale, 320, 240)
TEST_SCALETO(ARGBScale, 1280, 720)
TEST_SCALETO(ARGBScale, 1920, 1080)
+#endif // DISABLE_SLOW_TESTS
#undef TEST_SCALETO1
#undef TEST_SCALETO
+#define TEST_SCALESWAPXY1(name, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##SwapXY_##filter) { \
+ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test scale with swapped width and height with all 3 filters.
+TEST_SCALESWAPXY1(ARGBScale, None, 0)
+TEST_SCALESWAPXY1(ARGBScale, Linear, 0)
+TEST_SCALESWAPXY1(ARGBScale, Bilinear, 0)
+#else
+TEST_SCALESWAPXY1(ARGBScale, Bilinear, 0)
+#endif
+#undef TEST_SCALESWAPXY1
+
// Scale with YUV conversion to ARGB and clipping.
// TODO(fbarchard): Add fourcc support. All 4 ARGB formats is easy to support.
LIBYUV_API
@@ -454,4 +511,78 @@ TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
EXPECT_LE(diff, 10);
}
+TEST_F(LibYUVScaleTest, ARGBTest3x) {
+ const int kSrcStride = 480 * 4;
+ const int kDstStride = 160 * 4;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 480 * 3; ++i) {
+ orig_pixels[i * 4 + 0] = i;
+ orig_pixels[i * 4 + 1] = 255 - i;
+ orig_pixels[i * 4 + 2] = i + 1;
+ orig_pixels[i * 4 + 3] = i + 10;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
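+  // Scale the loop count so each benchmark pass covers roughly
+  // benchmark_width_ x benchmark_height_ pixels, 160 output pixels per call.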
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ ARGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
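+  // Each channel of dst pixel 0 effectively samples src pixel 481 (row 1,
+  // col 1, the centre of the first 3x3 block); 481 & 255 = 225. kFilterNone
+  // below picks the same pixel.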
+ EXPECT_EQ(225, dest_pixels[0]);
+ EXPECT_EQ(255 - 225, dest_pixels[1]);
+ EXPECT_EQ(226, dest_pixels[2]);
+ EXPECT_EQ(235, dest_pixels[3]);
+
+ ARGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterNone);
+
+ EXPECT_EQ(225, dest_pixels[0]);
+ EXPECT_EQ(255 - 225, dest_pixels[1]);
+ EXPECT_EQ(226, dest_pixels[2]);
+ EXPECT_EQ(235, dest_pixels[3]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, ARGBTest4x) {
+ const int kSrcStride = 640 * 4;
+ const int kDstStride = 160 * 4;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 640 * 4; ++i) {
+ orig_pixels[i * 4 + 0] = i;
+ orig_pixels[i * 4 + 1] = 255 - i;
+ orig_pixels[i * 4 + 2] = i + 1;
+ orig_pixels[i * 4 + 3] = i + 10;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ ARGBScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_NEAR(66, dest_pixels[0], 4);
+ EXPECT_NEAR(255 - 66, dest_pixels[1], 4);
+ EXPECT_NEAR(67, dest_pixels[2], 4);
+ EXPECT_NEAR(76, dest_pixels[3], 4);
+
+ ARGBScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
+ kFilterNone);
+
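+  // kFilterNone picks src pixel (row 2, col 2): (640 * 2 + 2) & 255 = 2.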
+ EXPECT_EQ(2, dest_pixels[0]);
+ EXPECT_EQ(255 - 2, dest_pixels[1]);
+ EXPECT_EQ(3, dest_pixels[2]);
+ EXPECT_EQ(12, dest_pixels[3]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
} // namespace libyuv
diff --git a/unit_test/scale_plane_test.cc b/unit_test/scale_plane_test.cc
new file mode 100644
index 00000000..9ce47a02
--- /dev/null
+++ b/unit_test/scale_plane_test.cc
@@ -0,0 +1,470 @@
+/*
+ * Copyright 2023 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/scale_row.h" // For ScaleRowDown2Box_Odd_C
+#endif
+
+#define STRINGIZE(line) #line
+#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+
+#if defined(__riscv) && !defined(__clang__)
+#define DISABLE_SLOW_TESTS
+#undef ENABLE_FULL_TESTS
+#undef ENABLE_ROW_TESTS
+#define LEAN_TESTS
+#endif
+
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that run unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
+namespace libyuv {
+
+#ifdef ENABLE_ROW_TESTS
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
+ SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]);
+ SIMD_ALIGNED(uint8_t dst_pixels_opt[64]);
+ SIMD_ALIGNED(uint8_t dst_pixels_c[64]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt));
+ memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
+
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ if (!has_ssse3) {
+ printf("Warning SSSE3 not detected; Skipping test.\n");
+ } else {
+ // TL.
+ orig_pixels[0] = 255u;
+ orig_pixels[1] = 0u;
+ orig_pixels[128 + 0] = 0u;
+ orig_pixels[128 + 1] = 0u;
+ // TR.
+ orig_pixels[2] = 0u;
+ orig_pixels[3] = 100u;
+ orig_pixels[128 + 2] = 0u;
+ orig_pixels[128 + 3] = 0u;
+ // BL.
+ orig_pixels[4] = 0u;
+ orig_pixels[5] = 0u;
+ orig_pixels[128 + 4] = 50u;
+ orig_pixels[128 + 5] = 0u;
+ // BR.
+ orig_pixels[6] = 0u;
+ orig_pixels[7] = 0u;
+ orig_pixels[128 + 6] = 0u;
+ orig_pixels[128 + 7] = 20u;
+ // Odd.
+ orig_pixels[126] = 4u;
+ orig_pixels[127] = 255u;
+ orig_pixels[128 + 126] = 16u;
+ orig_pixels[128 + 127] = 255u;
+
+ // Test regular half size.
+ ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64);
+
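+    // 2x2 box averages with rounding, e.g. dst[0] = (255 + 0 + 0 + 0 + 2) / 4
+    // = 64 and dst[63] = (4 + 255 + 16 + 255 + 2) / 4 = 133.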
+ EXPECT_EQ(64u, dst_pixels_c[0]);
+ EXPECT_EQ(25u, dst_pixels_c[1]);
+ EXPECT_EQ(13u, dst_pixels_c[2]);
+ EXPECT_EQ(5u, dst_pixels_c[3]);
+ EXPECT_EQ(0u, dst_pixels_c[4]);
+ EXPECT_EQ(133u, dst_pixels_c[63]);
+
+    // Test odd width version - the last pixel covers just 1 source column.
+ ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
+
+ EXPECT_EQ(64u, dst_pixels_c[0]);
+ EXPECT_EQ(25u, dst_pixels_c[1]);
+ EXPECT_EQ(13u, dst_pixels_c[2]);
+ EXPECT_EQ(5u, dst_pixels_c[3]);
+ EXPECT_EQ(0u, dst_pixels_c[4]);
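+    // The odd width last pixel averages one source column over two rows:
+    // (4 + 16 + 1) / 2 = 10.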
+ EXPECT_EQ(10u, dst_pixels_c[63]);
+
+    // Test with one pixel less; the last destination pixel should be skipped.
+ memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
+ ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63);
+
+ EXPECT_EQ(64u, dst_pixels_c[0]);
+ EXPECT_EQ(25u, dst_pixels_c[1]);
+ EXPECT_EQ(13u, dst_pixels_c[2]);
+ EXPECT_EQ(5u, dst_pixels_c[3]);
+ EXPECT_EQ(0u, dst_pixels_c[4]);
+ EXPECT_EQ(0u, dst_pixels_c[63]);
+
+ // Test regular half size SSSE3.
+ ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
+
+ EXPECT_EQ(64u, dst_pixels_opt[0]);
+ EXPECT_EQ(25u, dst_pixels_opt[1]);
+ EXPECT_EQ(13u, dst_pixels_opt[2]);
+ EXPECT_EQ(5u, dst_pixels_opt[3]);
+ EXPECT_EQ(0u, dst_pixels_opt[4]);
+ EXPECT_EQ(133u, dst_pixels_opt[63]);
+
+    // Verify the C and SSSE3 odd width versions match.
+ ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
+ ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
+ for (int i = 0; i < 64; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ }
+}
+#endif // HAS_SCALEROWDOWN2_SSSE3
+
+extern "C" void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+
+TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
+ SIMD_ALIGNED(uint16_t orig_pixels[2560 * 2]);
+ SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
+ SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
+
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+ for (int i = 0; i < 2560 * 2; ++i) {
+ orig_pixels[i] = i;
+ }
+ ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
+ } else {
+ ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
+ }
+#else
+ ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
+#endif
+ }
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4);
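+  // Last pixel: (2558 + 2559 + 5118 + 5119 + 2) / 4 = 3839.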
+ EXPECT_EQ(dst_pixels_c[1279], 3839);
+}
+#endif // ENABLE_ROW_TESTS
+
+// Test plane scaling with 8 bit C vs 16 bit C and return the maximum pixel
+// difference.
+// 0 = exact.
+static int TestPlaneFilter_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i;
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int src_stride_y = Abs(src_width);
+ int dst_y_plane_size = dst_width * dst_height;
+ int dst_stride_y = dst_width;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_y_16, src_y_plane_size * 2);
+ align_buffer_page_end(dst_y_8, dst_y_plane_size);
+ align_buffer_page_end(dst_y_16, dst_y_plane_size * 2);
+ uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16);
+ uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16);
+
+ MemRandomize(src_y, src_y_plane_size);
+ memset(dst_y_8, 0, dst_y_plane_size);
+ memset(dst_y_16, 1, dst_y_plane_size * 2);
+
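+  // Replicate the 8 bit source into the 16 bit buffer so both scalers see
+  // identical input and the outputs can be compared directly.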
+ for (i = 0; i < src_y_plane_size; ++i) {
+ p_src_y_16[i] = src_y[i] & 255;
+ }
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y_8, dst_stride_y,
+ dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+
+ for (i = 0; i < benchmark_iterations; ++i) {
+ ScalePlane_16(p_src_y_16, src_stride_y, src_width, src_height, p_dst_y_16,
+ dst_stride_y, dst_width, dst_height, f);
+ }
+
+ // Expect an exact match.
+ int max_diff = 0;
+ for (i = 0; i < dst_y_plane_size; ++i) {
+ int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_8);
+ free_aligned_buffer_page_end(dst_y_16);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_y_16);
+
+ return max_diff;
+}
+
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+// The factor of 2 keeps the dimensions even for chroma subsampling.
+#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
+#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
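+// For example, with benchmark_width_ = 1280 and nom/denom = 3/4:
+// SX gives ((1280 / 3 + 1) / 2) * 4 * 2 = 1704 source pixels and
+// DX gives ((1280 / 3 + 1) / 2) * 3 * 2 = 1278 destination pixels,
+// an exact 3/4 scale with both dimensions even.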
+
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, DISABLED_##ScalePlaneDownBy##name##_##filter##_16) { \
+ int diff = TestPlaneFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test a scale factor with all 4 filters. Expect unfiltered to be exact; the
+// filtered results may differ because SSSE3, NEON and C use different
+// fixed-point implementations.
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(name, None, nom, denom, 0) \
+ TEST_FACTOR1(name, Linear, nom, denom, boxdiff) \
+ TEST_FACTOR1(name, Bilinear, nom, denom, boxdiff) \
+ TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+
+TEST_FACTOR(2, 1, 2, 0)
+TEST_FACTOR(4, 1, 4, 0)
+// TEST_FACTOR(8, 1, 8, 0) is disabled for benchmark performance: ~90 seconds.
+TEST_FACTOR(3by4, 3, 4, 1)
+TEST_FACTOR(3by8, 3, 8, 1)
+TEST_FACTOR(3, 1, 3, 0)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
+TEST_F(LibYUVScaleTest, PlaneTest3x) {
+ const int kSrcStride = 480;
+ const int kDstStride = 160;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 480 * 3; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
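+  // dst[0] effectively samples src(row 1, col 1) = index 481, and
+  // 481 & 255 = 225; kFilterNone below picks the same pixel.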
+ EXPECT_EQ(225, dest_pixels[0]);
+
+ ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterNone);
+
+ EXPECT_EQ(225, dest_pixels[0]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, PlaneTest4x) {
+ const int kSrcStride = 640;
+ const int kDstStride = 160;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 640 * 4; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(66, dest_pixels[0]);
+
+ ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
+ kFilterNone);
+
+  EXPECT_EQ(2, dest_pixels[0]);  // 3rd pixel of 3rd row: (640 * 2 + 2) & 255.
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+// Intent is to test 200x50 to 50x200 but width and height can be parameters.
+TEST_F(LibYUVScaleTest, PlaneTestRotate_None) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_, benchmark_height_,
+ benchmark_width_, kFilterNone);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_,
+ benchmark_height_, dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_, kFilterNone);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_, benchmark_height_,
+ benchmark_width_, kFilterBilinear);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_,
+ benchmark_height_, dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_, kFilterBilinear);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+// Intent is to test 200x50 to 50x200 but width and height can be parameters.
+TEST_F(LibYUVScaleTest, PlaneTestRotate_Box) {
+ const int kSize = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < kSize; ++i) {
+ orig_pixels[i] = i;
+ }
+ align_buffer_page_end(dest_opt_pixels, kSize);
+ align_buffer_page_end(dest_c_pixels, kSize);
+
+ MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization.
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_,
+ dest_c_pixels, benchmark_height_, benchmark_height_,
+ benchmark_width_, kFilterBox);
+ MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization.
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ScalePlane(orig_pixels, benchmark_width_, benchmark_width_,
+ benchmark_height_, dest_opt_pixels, benchmark_height_,
+ benchmark_height_, benchmark_width_, kFilterBox);
+ }
+
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+ }
+
+ free_aligned_buffer_page_end(dest_c_pixels);
+ free_aligned_buffer_page_end(dest_opt_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, PlaneTest1_Box) {
+ align_buffer_page_end(orig_pixels, 3);
+ align_buffer_page_end(dst_pixels, 3);
+
+ // Pad the 1x1 byte image with invalid values before and after in case libyuv
+ // reads outside the memory boundaries.
+ orig_pixels[0] = 0;
+ orig_pixels[1] = 1; // scale this pixel
+ orig_pixels[2] = 2;
+ dst_pixels[0] = 3;
+ dst_pixels[1] = 3;
+ dst_pixels[2] = 3;
+
+ libyuv::ScalePlane(orig_pixels + 1, /* src_stride= */ 1, /* src_width= */ 1,
+ /* src_height= */ 1, dst_pixels, /* dst_stride= */ 1,
+ /* dst_width= */ 1, /* dst_height= */ 2,
+ libyuv::kFilterBox);
+
+ EXPECT_EQ(dst_pixels[0], 1);
+ EXPECT_EQ(dst_pixels[1], 1);
+ EXPECT_EQ(dst_pixels[2], 3);
+
+ free_aligned_buffer_page_end(dst_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, PlaneTest1_16_Box) {
+ align_buffer_page_end(orig_pixels_alloc, 3 * 2);
+ align_buffer_page_end(dst_pixels_alloc, 3 * 2);
+ uint16_t* orig_pixels = (uint16_t*)orig_pixels_alloc;
+ uint16_t* dst_pixels = (uint16_t*)dst_pixels_alloc;
+
+  // Pad the 1x1 uint16_t image with invalid values before and after in case
+  // libyuv reads outside the memory boundaries.
+ orig_pixels[0] = 0;
+ orig_pixels[1] = 1; // scale this pixel
+ orig_pixels[2] = 2;
+ dst_pixels[0] = 3;
+ dst_pixels[1] = 3;
+ dst_pixels[2] = 3;
+
+ libyuv::ScalePlane_16(
+ orig_pixels + 1, /* src_stride= */ 1, /* src_width= */ 1,
+ /* src_height= */ 1, dst_pixels, /* dst_stride= */ 1,
+ /* dst_width= */ 1, /* dst_height= */ 2, libyuv::kFilterNone);
+
+ EXPECT_EQ(dst_pixels[0], 1);
+ EXPECT_EQ(dst_pixels[1], 1);
+ EXPECT_EQ(dst_pixels[2], 3);
+
+ free_aligned_buffer_page_end(dst_pixels_alloc);
+ free_aligned_buffer_page_end(orig_pixels_alloc);
+}
+} // namespace libyuv
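The plane tests above all follow one pattern: mask the CPU flags to force the
C reference path, rerun with the real flags, and compare the two outputs. A
minimal sketch of that pattern against the public libyuv API (not part of the
patch; the helper name is hypothetical):

#include <cstdint>
#include <cstring>  // std::memcmp

#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"

// Returns true when the optimized ScalePlane output matches the C path.
bool ScalePlaneMatchesC(const uint8_t* src, int w, int h, int dw, int dh,
                        uint8_t* dst_c, uint8_t* dst_opt) {
  libyuv::MaskCpuFlags(1);   // Disable all CPU optimizations (C only).
  libyuv::ScalePlane(src, w, w, h, dst_c, dw, dw, dh, libyuv::kFilterBilinear);
  libyuv::MaskCpuFlags(-1);  // Re-enable all detected SIMD paths.
  libyuv::ScalePlane(src, w, w, h, dst_opt, dw, dw, dh,
                     libyuv::kFilterBilinear);
  return std::memcmp(dst_c, dst_opt, static_cast<size_t>(dw) * dh) == 0;
}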
diff --git a/unit_test/scale_rgb_test.cc b/unit_test/scale_rgb_test.cc
new file mode 100644
index 00000000..8296abe3
--- /dev/null
+++ b/unit_test/scale_rgb_test.cc
@@ -0,0 +1,280 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale_rgb.h"
+
+namespace libyuv {
+
+#define STRINGIZE(line) #line
+#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that exercise unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int RGBTestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i, j;
+ const int b = 0;  // Set to 128 to test for padding/stride.
+ int64_t src_rgb_plane_size =
+ (Abs(src_width) + b * 3) * (Abs(src_height) + b * 3) * 3LL;
+ int src_stride_rgb = (b * 3 + Abs(src_width)) * 3;
+
+ align_buffer_page_end(src_rgb, src_rgb_plane_size);
+ if (!src_rgb) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_rgb, src_rgb_plane_size);
+
+ int64_t dst_rgb_plane_size = (dst_width + b * 3) * (dst_height + b * 3) * 3LL;
+ int dst_stride_rgb = (b * 3 + dst_width) * 3;
+
+ align_buffer_page_end(dst_rgb_c, dst_rgb_plane_size);
+ align_buffer_page_end(dst_rgb_opt, dst_rgb_plane_size);
+ if (!dst_rgb_c || !dst_rgb_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ memset(dst_rgb_c, 2, dst_rgb_plane_size);
+ memset(dst_rgb_opt, 3, dst_rgb_plane_size);
+
+ // Warm up both versions for consistent benchmarks.
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ RGBScale(src_rgb + (src_stride_rgb * b) + b * 3, src_stride_rgb, src_width,
+ src_height, dst_rgb_c + (dst_stride_rgb * b) + b * 3, dst_stride_rgb,
+ dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ RGBScale(src_rgb + (src_stride_rgb * b) + b * 3, src_stride_rgb, src_width,
+ src_height, dst_rgb_opt + (dst_stride_rgb * b) + b * 3,
+ dst_stride_rgb, dst_width, dst_height, f);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ RGBScale(src_rgb + (src_stride_rgb * b) + b * 3, src_stride_rgb, src_width,
+ src_height, dst_rgb_c + (dst_stride_rgb * b) + b * 3, dst_stride_rgb,
+ dst_width, dst_height, f);
+
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ RGBScale(src_rgb + (src_stride_rgb * b) + b * 3, src_stride_rgb, src_width,
+ src_height, dst_rgb_opt + (dst_stride_rgb * b) + b * 3,
+ dst_stride_rgb, dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ // The C version may be a little off from the optimized path. Order of
+ // operations may introduce rounding somewhere. So do a difference
+ // of the buffers and check that the max difference is not over 3.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b * 3; j < (dst_width + b) * 3; ++j) {
+ int abs_diff = Abs(dst_rgb_c[(i * dst_stride_rgb) + j] -
+ dst_rgb_opt[(i * dst_stride_rgb) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_rgb_c);
+ free_aligned_buffer_page_end(dst_rgb_opt);
+ free_aligned_buffer_page_end(src_rgb);
+ return max_diff;
+}
+
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
+#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
+
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, RGBScaleDownBy##name##_##filter) { \
+ int diff = RGBTestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
+// filtered results may differ between the SSSE3, Neon and C fixed point
+// implementations.
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(name, None, nom, denom, 0) \
+ TEST_FACTOR1(name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(name, Box, nom, denom, 3)
+#else
+// Test a scale factor with Bilinear.
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(name, Bilinear, nom, denom, 3)
+#endif
+
+TEST_FACTOR(2, 1, 2)
+#ifndef DISABLE_SLOW_TESTS
+TEST_FACTOR(4, 1, 4)
+// TEST_FACTOR(8, 1, 8) Disabled to keep benchmark time reasonable.
+TEST_FACTOR(3by4, 3, 4)
+TEST_FACTOR(3by8, 3, 8)
+TEST_FACTOR(3, 1, 3)
+#endif
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
+#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
+ int diff = RGBTestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
+ int diff = RGBTestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test scale to a specified size with all 3 filters.
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, None, 0) \
+ TEST_SCALETO1(name, width, height, Linear, 3) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3)
+#else
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3)
+#endif
+
+TEST_SCALETO(RGBScale, 640, 360)
+#ifndef DISABLE_SLOW_TESTS
+TEST_SCALETO(RGBScale, 1, 1)
+TEST_SCALETO(RGBScale, 256, 144) /* 128x72 * 2 */
+TEST_SCALETO(RGBScale, 320, 240)
+TEST_SCALETO(RGBScale, 569, 480)
+TEST_SCALETO(RGBScale, 1280, 720)
+TEST_SCALETO(RGBScale, 1920, 1080)
+#endif // DISABLE_SLOW_TESTS
+#undef TEST_SCALETO1
+#undef TEST_SCALETO
+
+#define TEST_SCALESWAPXY1(name, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##SwapXY_##filter) { \
+ int diff = RGBTestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test scale with swapped width and height with all 3 filters.
+TEST_SCALESWAPXY1(RGBScale, None, 0)
+TEST_SCALESWAPXY1(RGBScale, Linear, 0)
+TEST_SCALESWAPXY1(RGBScale, Bilinear, 0)
+#else
+TEST_SCALESWAPXY1(RGBScale, Bilinear, 0)
+#endif
+#undef TEST_SCALESWAPXY1
+
+TEST_F(LibYUVScaleTest, RGBTest3x) {
+ const int kSrcStride = 480 * 3;
+ const int kDstStride = 160 * 3;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 480 * 3; ++i) {
+ orig_pixels[i * 3 + 0] = i;
+ orig_pixels[i * 3 + 1] = 255 - i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ RGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(225, dest_pixels[0]);
+ EXPECT_EQ(255 - 225, dest_pixels[1]);
+
+ RGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterNone);
+
+ EXPECT_EQ(225, dest_pixels[0]);
+ EXPECT_EQ(255 - 225, dest_pixels[1]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, RGBTest4x) {
+ const int kSrcStride = 640 * 3;
+ const int kDstStride = 160 * 3;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 640 * 4; ++i) {
+ orig_pixels[i * 3 + 0] = i;
+ orig_pixels[i * 3 + 1] = 255 - i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ RGBScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(66, dest_pixels[0]);
+ EXPECT_EQ(190, dest_pixels[1]);
+
+ RGBScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(2, dest_pixels[0]); // expect the 3rd pixel of the 3rd row
+ EXPECT_EQ(255 - 2, dest_pixels[1]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+} // namespace libyuv
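RGBScale, exercised above, is the packed 24-bit RGB entry point declared in
libyuv/scale_rgb.h. A minimal usage sketch (not part of the patch; the sizes
and helper name are hypothetical; strides are in bytes, 3 per pixel):

#include <cstdint>
#include <vector>

#include "libyuv/scale.h"  // FilterMode
#include "libyuv/scale_rgb.h"

// Halve a tightly packed 640x360 RGB24 image with bilinear filtering.
std::vector<uint8_t> HalveRGB24(const std::vector<uint8_t>& src) {
  const int sw = 640, sh = 360, dw = 320, dh = 180;
  std::vector<uint8_t> dst(static_cast<size_t>(dw) * dh * 3);
  libyuv::RGBScale(src.data(), sw * 3, sw, sh, dst.data(), dw * 3, dw, dh,
                   libyuv::kFilterBilinear);
  return dst;
}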
diff --git a/files/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 811b2d04..6e3b9271 100644
--- a/files/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -14,11 +14,25 @@
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
+
+#ifdef ENABLE_ROW_TESTS
#include "libyuv/scale_row.h" // For ScaleRowDown2Box_Odd_C
+#endif
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+#if defined(__riscv) && !defined(__clang__)
+#define DISABLE_SLOW_TESTS
+#undef ENABLE_FULL_TESTS
+#endif
+
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that exercise unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
namespace libyuv {
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
@@ -139,6 +153,123 @@ static int I420TestFilter(int src_width,
return max_diff;
}
+// Test scaling with 8 bit C vs 12 bit C and return maximum pixel difference.
+// 0 = exact.
+static int I420TestFilter_12(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i;
+ int src_width_uv = (Abs(src_width) + 1) >> 1;
+ int src_height_uv = (Abs(src_height) + 1) >> 1;
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv);
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+ align_buffer_page_end(src_y_12, src_y_plane_size * 2);
+ align_buffer_page_end(src_u_12, src_uv_plane_size * 2);
+ align_buffer_page_end(src_v_12, src_uv_plane_size * 2);
+ if (!src_y || !src_u || !src_v || !src_y_12 || !src_u_12 || !src_v_12) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ uint16_t* p_src_y_12 = reinterpret_cast<uint16_t*>(src_y_12);
+ uint16_t* p_src_u_12 = reinterpret_cast<uint16_t*>(src_u_12);
+ uint16_t* p_src_v_12 = reinterpret_cast<uint16_t*>(src_v_12);
+
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ for (i = 0; i < src_y_plane_size; ++i) {
+ p_src_y_12[i] = src_y[i];
+ }
+ for (i = 0; i < src_uv_plane_size; ++i) {
+ p_src_u_12[i] = src_u[i];
+ p_src_v_12[i] = src_v[i];
+ }
+
+ int dst_width_uv = (dst_width + 1) >> 1;
+ int dst_height_uv = (dst_height + 1) >> 1;
+
+ int dst_y_plane_size = (dst_width) * (dst_height);
+ int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv);
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv;
+
+ align_buffer_page_end(dst_y_8, dst_y_plane_size);
+ align_buffer_page_end(dst_u_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_12, dst_y_plane_size * 2);
+ align_buffer_page_end(dst_u_12, dst_uv_plane_size * 2);
+ align_buffer_page_end(dst_v_12, dst_uv_plane_size * 2);
+
+ uint16_t* p_dst_y_12 = reinterpret_cast<uint16_t*>(dst_y_12);
+ uint16_t* p_dst_u_12 = reinterpret_cast<uint16_t*>(dst_u_12);
+ uint16_t* p_dst_v_12 = reinterpret_cast<uint16_t*>(dst_v_12);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_8, dst_stride_y, dst_u_8,
+ dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I420Scale_12(p_src_y_12, src_stride_y, p_src_u_12, src_stride_uv,
+ p_src_v_12, src_stride_uv, src_width, src_height, p_dst_y_12,
+ dst_stride_y, p_dst_u_12, dst_stride_uv, p_dst_v_12,
+ dst_stride_uv, dst_width, dst_height, f);
+ }
+
+ // Expect an exact match.
+ int max_diff = 0;
+ for (i = 0; i < dst_y_plane_size; ++i) {
+ int abs_diff = Abs(dst_y_8[i] - p_dst_y_12[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ for (i = 0; i < dst_uv_plane_size; ++i) {
+ int abs_diff = Abs(dst_u_8[i] - p_dst_u_12[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = Abs(dst_v_8[i] - p_dst_v_12[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_8);
+ free_aligned_buffer_page_end(dst_u_8);
+ free_aligned_buffer_page_end(dst_v_8);
+ free_aligned_buffer_page_end(dst_y_12);
+ free_aligned_buffer_page_end(dst_u_12);
+ free_aligned_buffer_page_end(dst_v_12);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
+ free_aligned_buffer_page_end(src_y_12);
+ free_aligned_buffer_page_end(src_u_12);
+ free_aligned_buffer_page_end(src_v_12);
+
+ return max_diff;
+}
+
// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
// 0 = exact.
static int I420TestFilter_16(int src_width,
@@ -374,6 +505,123 @@ static int I444TestFilter(int src_width,
return max_diff;
}
+// Test scaling with 8 bit C vs 12 bit C and return maximum pixel difference.
+// 0 = exact.
+static int I444TestFilter_12(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i;
+ int src_width_uv = Abs(src_width);
+ int src_height_uv = Abs(src_height);
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv);
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+ align_buffer_page_end(src_y_12, src_y_plane_size * 2);
+ align_buffer_page_end(src_u_12, src_uv_plane_size * 2);
+ align_buffer_page_end(src_v_12, src_uv_plane_size * 2);
+ if (!src_y || !src_u || !src_v || !src_y_12 || !src_u_12 || !src_v_12) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ uint16_t* p_src_y_12 = reinterpret_cast<uint16_t*>(src_y_12);
+ uint16_t* p_src_u_12 = reinterpret_cast<uint16_t*>(src_u_12);
+ uint16_t* p_src_v_12 = reinterpret_cast<uint16_t*>(src_v_12);
+
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ for (i = 0; i < src_y_plane_size; ++i) {
+ p_src_y_12[i] = src_y[i];
+ }
+ for (i = 0; i < src_uv_plane_size; ++i) {
+ p_src_u_12[i] = src_u[i];
+ p_src_v_12[i] = src_v[i];
+ }
+
+ int dst_width_uv = dst_width;
+ int dst_height_uv = dst_height;
+
+ int dst_y_plane_size = (dst_width) * (dst_height);
+ int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv);
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv;
+
+ align_buffer_page_end(dst_y_8, dst_y_plane_size);
+ align_buffer_page_end(dst_u_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_12, dst_y_plane_size * 2);
+ align_buffer_page_end(dst_u_12, dst_uv_plane_size * 2);
+ align_buffer_page_end(dst_v_12, dst_uv_plane_size * 2);
+
+ uint16_t* p_dst_y_12 = reinterpret_cast<uint16_t*>(dst_y_12);
+ uint16_t* p_dst_u_12 = reinterpret_cast<uint16_t*>(dst_u_12);
+ uint16_t* p_dst_v_12 = reinterpret_cast<uint16_t*>(dst_v_12);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_8, dst_stride_y, dst_u_8,
+ dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I444Scale_12(p_src_y_12, src_stride_y, p_src_u_12, src_stride_uv,
+ p_src_v_12, src_stride_uv, src_width, src_height, p_dst_y_12,
+ dst_stride_y, p_dst_u_12, dst_stride_uv, p_dst_v_12,
+ dst_stride_uv, dst_width, dst_height, f);
+ }
+
+ // Expect an exact match.
+ int max_diff = 0;
+ for (i = 0; i < dst_y_plane_size; ++i) {
+ int abs_diff = Abs(dst_y_8[i] - p_dst_y_12[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ for (i = 0; i < dst_uv_plane_size; ++i) {
+ int abs_diff = Abs(dst_u_8[i] - p_dst_u_12[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = Abs(dst_v_8[i] - p_dst_v_12[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_8);
+ free_aligned_buffer_page_end(dst_u_8);
+ free_aligned_buffer_page_end(dst_v_8);
+ free_aligned_buffer_page_end(dst_y_12);
+ free_aligned_buffer_page_end(dst_u_12);
+ free_aligned_buffer_page_end(dst_v_12);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
+ free_aligned_buffer_page_end(src_y_12);
+ free_aligned_buffer_page_end(src_u_12);
+ free_aligned_buffer_page_end(src_v_12);
+
+ return max_diff;
+}
+
// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
// 0 = exact.
static int I444TestFilter_16(int src_width,
@@ -491,57 +739,185 @@ static int I444TestFilter_16(int src_width,
return max_diff;
}
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int NV12TestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i, j;
+ int src_width_uv = (Abs(src_width) + 1) >> 1;
+ int src_height_uv = (Abs(src_height) + 1) >> 1;
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv) * 2;
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv * 2;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_uv, src_uv_plane_size);
+ if (!src_y || !src_uv) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_uv, src_uv_plane_size);
+
+ int dst_width_uv = (dst_width + 1) >> 1;
+ int dst_height_uv = (dst_height + 1) >> 1;
+
+ int64_t dst_y_plane_size = (dst_width) * (dst_height);
+ int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv) * 2;
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv * 2;
+
+ align_buffer_page_end(dst_y_c, dst_y_plane_size);
+ align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size);
+ align_buffer_page_end(dst_uv_opt, dst_uv_plane_size);
+ if (!dst_y_c || !dst_uv_c || !dst_y_opt || !dst_uv_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv, src_width, src_height,
+ dst_y_c, dst_stride_y, dst_uv_c, dst_stride_uv, dst_width,
+ dst_height, f);
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv, src_width, src_height,
+ dst_y_opt, dst_stride_y, dst_uv_opt, dst_stride_uv, dst_width,
+ dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+ // Report performance of C vs OPT.
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ // C version may be a little off from the optimized. Order of
+ // operations may introduce rounding somewhere. So do a difference
+ // of the buffers and look to see that the max difference is not
+ // over 3.
+ int max_diff = 0;
+ for (i = 0; i < (dst_height); ++i) {
+ for (j = 0; j < (dst_width); ++j) {
+ int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
+ dst_y_opt[(i * dst_stride_y) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ for (i = 0; i < (dst_height_uv); ++i) {
+ for (j = 0; j < (dst_width_uv * 2); ++j) {
+ int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] -
+ dst_uv_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_c);
+ free_aligned_buffer_page_end(dst_uv_c);
+ free_aligned_buffer_page_end(dst_y_opt);
+ free_aligned_buffer_page_end(dst_uv_opt);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_uv);
+
+ return max_diff;
+}
+
// The following adjustments in dimensions ensure the scale factor will be
// exactly achieved.
// 2 is chroma subsample.
#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
- TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \
- int diff = I420TestFilter( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \
- int diff = I444TestFilter( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter##_16) { \
- int diff = I420TestFilter_16( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- } \
- TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter##_16) { \
- int diff = I444TestFilter_16( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
+#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \
+ int diff = I420TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \
+ int diff = I444TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I420ScaleDownBy##name##_##filter##_12) { \
+ int diff = I420TestFilter_12( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I444ScaleDownBy##name##_##filter##_12) { \
+ int diff = I444TestFilter_12( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, NV12ScaleDownBy##name##_##filter) { \
+ int diff = NV12TestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
}
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtering is different fixed point implementations for SSSE3, Neon and C.
-#define TEST_FACTOR(name, nom, denom, boxdiff) \
- TEST_FACTOR1(name, None, nom, denom, 0) \
- TEST_FACTOR1(name, Linear, nom, denom, 3) \
- TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
- TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+#ifndef DISABLE_SLOW_TESTS
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(, name, None, nom, denom, 0) \
+ TEST_FACTOR1(, name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(, name, Box, nom, denom, boxdiff)
+#else
+#if defined(ENABLE_FULL_TESTS)
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \
+ TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Box, nom, denom, boxdiff)
+#else
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \
+ TEST_FACTOR1(DISABLED_, name, Box, nom, denom, boxdiff)
+#endif
+#endif
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0)
+#ifndef DISABLE_SLOW_TESTS
TEST_FACTOR(8, 1, 8, 0)
+#endif
TEST_FACTOR(3by4, 3, 4, 1)
TEST_FACTOR(3by8, 3, 8, 1)
TEST_FACTOR(3, 1, 3, 0)
@@ -550,7 +926,7 @@ TEST_FACTOR(3, 1, 3, 0)
#undef SX
#undef DX
-#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter) { \
int diff = I420TestFilter(benchmark_width_, benchmark_height_, width, \
height, kFilter##filter, benchmark_iterations_, \
@@ -563,18 +939,40 @@ TEST_FACTOR(3, 1, 3, 0)
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter##_16) { \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I420##name##To##width##x##height##_##filter##_12) { \
+ int diff = I420TestFilter_12( \
+ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I444##name##To##width##x##height##_##filter##_12) { \
+ int diff = I444TestFilter_12( \
+ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \
int diff = I420TestFilter_16( \
benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter##_16) { \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \
int diff = I444TestFilter_16( \
benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
+ TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) { \
+ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
TEST_F(LibYUVScaleTest, I420##name##From##width##x##height##_##filter) { \
int diff = I420TestFilter(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
@@ -590,7 +988,23 @@ TEST_FACTOR(3, 1, 3, 0)
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
- I420##name##From##width##x##height##_##filter##_16) { \
+ DISABLED_##I420##name##From##width##x##height##_##filter##_12) { \
+ int diff = I420TestFilter_12(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I444##name##From##width##x##height##_##filter##_12) { \
+ int diff = I444TestFilter_12(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, \
+ DISABLED_##I420##name##From##width##x##height##_##filter##_16) { \
int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, \
@@ -598,307 +1012,122 @@ TEST_FACTOR(3, 1, 3, 0)
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
- I444##name##From##width##x##height##_##filter##_16) { \
+ DISABLED_##I444##name##From##width##x##height##_##filter##_16) { \
int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, NV12##name##From##width##x##height##_##filter) { \
+ int diff = NV12TestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
}
+#ifndef DISABLE_SLOW_TESTS
// Test scale to a specified size with all 4 filters.
-#define TEST_SCALETO(name, width, height) \
- TEST_SCALETO1(name, width, height, None, 0) \
- TEST_SCALETO1(name, width, height, Linear, 3) \
- TEST_SCALETO1(name, width, height, Bilinear, 3) \
- TEST_SCALETO1(name, width, height, Box, 3)
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(, name, width, height, None, 0) \
+ TEST_SCALETO1(, name, width, height, Linear, 3) \
+ TEST_SCALETO1(, name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(, name, width, height, Box, 3)
+#else
+#if defined(ENABLE_FULL_TESTS)
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Box, 3)
+#else
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(DISABLED_, name, width, height, Box, 3)
+#endif
+#endif
TEST_SCALETO(Scale, 1, 1)
-TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 569, 480)
TEST_SCALETO(Scale, 640, 360)
+#ifndef DISABLE_SLOW_TESTS
+TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */
+TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 1280, 720)
TEST_SCALETO(Scale, 1920, 1080)
+#endif // DISABLE_SLOW_TESTS
#undef TEST_SCALETO1
#undef TEST_SCALETO
-#ifdef HAS_SCALEROWDOWN2_SSSE3
-TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
- SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]);
- SIMD_ALIGNED(uint8_t dst_pixels_opt[64]);
- SIMD_ALIGNED(uint8_t dst_pixels_c[64]);
- memset(orig_pixels, 0, sizeof(orig_pixels));
- memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt));
- memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
-
- int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
- if (!has_ssse3) {
- printf("Warning SSSE3 not detected; Skipping test.\n");
- } else {
- // TL.
- orig_pixels[0] = 255u;
- orig_pixels[1] = 0u;
- orig_pixels[128 + 0] = 0u;
- orig_pixels[128 + 1] = 0u;
- // TR.
- orig_pixels[2] = 0u;
- orig_pixels[3] = 100u;
- orig_pixels[128 + 2] = 0u;
- orig_pixels[128 + 3] = 0u;
- // BL.
- orig_pixels[4] = 0u;
- orig_pixels[5] = 0u;
- orig_pixels[128 + 4] = 50u;
- orig_pixels[128 + 5] = 0u;
- // BR.
- orig_pixels[6] = 0u;
- orig_pixels[7] = 0u;
- orig_pixels[128 + 6] = 0u;
- orig_pixels[128 + 7] = 20u;
- // Odd.
- orig_pixels[126] = 4u;
- orig_pixels[127] = 255u;
- orig_pixels[128 + 126] = 16u;
- orig_pixels[128 + 127] = 255u;
-
- // Test regular half size.
- ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64);
-
- EXPECT_EQ(64u, dst_pixels_c[0]);
- EXPECT_EQ(25u, dst_pixels_c[1]);
- EXPECT_EQ(13u, dst_pixels_c[2]);
- EXPECT_EQ(5u, dst_pixels_c[3]);
- EXPECT_EQ(0u, dst_pixels_c[4]);
- EXPECT_EQ(133u, dst_pixels_c[63]);
-
- // Test Odd width version - Last pixel is just 1 horizontal pixel.
- ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
-
- EXPECT_EQ(64u, dst_pixels_c[0]);
- EXPECT_EQ(25u, dst_pixels_c[1]);
- EXPECT_EQ(13u, dst_pixels_c[2]);
- EXPECT_EQ(5u, dst_pixels_c[3]);
- EXPECT_EQ(0u, dst_pixels_c[4]);
- EXPECT_EQ(10u, dst_pixels_c[63]);
-
- // Test one pixel less, should skip the last pixel.
- memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
- ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63);
-
- EXPECT_EQ(64u, dst_pixels_c[0]);
- EXPECT_EQ(25u, dst_pixels_c[1]);
- EXPECT_EQ(13u, dst_pixels_c[2]);
- EXPECT_EQ(5u, dst_pixels_c[3]);
- EXPECT_EQ(0u, dst_pixels_c[4]);
- EXPECT_EQ(0u, dst_pixels_c[63]);
-
- // Test regular half size SSSE3.
- ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
-
- EXPECT_EQ(64u, dst_pixels_opt[0]);
- EXPECT_EQ(25u, dst_pixels_opt[1]);
- EXPECT_EQ(13u, dst_pixels_opt[2]);
- EXPECT_EQ(5u, dst_pixels_opt[3]);
- EXPECT_EQ(0u, dst_pixels_opt[4]);
- EXPECT_EQ(133u, dst_pixels_opt[63]);
-
- // Compare C and SSSE3 match.
- ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
- ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
- for (int i = 0; i < 64; ++i) {
- EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
- }
- }
-}
-#endif // HAS_SCALEROWDOWN2_SSSE3
-
-extern "C" void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-extern "C" void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-extern "C" void ScaleRowUp2_16_C(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-
-TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
- SIMD_ALIGNED(uint16_t orig_pixels[640 * 2 + 1]); // 2 rows + 1 pixel overrun.
- SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
- SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
-
- memset(orig_pixels, 0, sizeof(orig_pixels));
- memset(dst_pixels_opt, 1, sizeof(dst_pixels_opt));
- memset(dst_pixels_c, 2, sizeof(dst_pixels_c));
-
- for (int i = 0; i < 640 * 2 + 1; ++i) {
- orig_pixels[i] = i;
- }
- ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
- for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
- int has_neon = TestCpuFlag(kCpuHasNEON);
- if (has_neon) {
- ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
- } else {
- ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
- }
-#elif !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
- int has_mmi = TestCpuFlag(kCpuHasMMI);
- if (has_mmi) {
- ScaleRowUp2_16_MMI(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
- } else {
- ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
- }
-#else
- ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
-#endif
- }
-
- for (int i = 0; i < 1280; ++i) {
- EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+#define TEST_SCALESWAPXY1(DISABLED_, name, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, I420##name##SwapXY_##filter) { \
+ int diff = I420TestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I444##name##SwapXY_##filter) { \
+ int diff = I444TestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_12) { \
+ int diff = I420TestFilter_12(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_12) { \
+ int diff = I444TestFilter_12(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_16) { \
+ int diff = I420TestFilter_16(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_16) { \
+ int diff = I444TestFilter_16(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, NV12##name##SwapXY_##filter) { \
+ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, \
+ benchmark_height_, benchmark_width_, \
+ kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
}
- EXPECT_EQ(dst_pixels_c[0], (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16);
- EXPECT_EQ(dst_pixels_c[1279], 800);
-}
-extern "C" void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width);
-
-TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
- SIMD_ALIGNED(uint16_t orig_pixels[2560 * 2]);
- SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
- SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
-
- memset(orig_pixels, 0, sizeof(orig_pixels));
- memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
- memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
-
- for (int i = 0; i < 2560 * 2; ++i) {
- orig_pixels[i] = i;
- }
- ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280);
- for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
- int has_neon = TestCpuFlag(kCpuHasNEON);
- if (has_neon) {
- ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
- } else {
- ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
- }
+// Test scale with swapped width and height with all 4 filters.
+#ifndef DISABLE_SLOW_TESTS
+TEST_SCALESWAPXY1(, Scale, None, 0)
+TEST_SCALESWAPXY1(, Scale, Linear, 3)
+TEST_SCALESWAPXY1(, Scale, Bilinear, 3)
+TEST_SCALESWAPXY1(, Scale, Box, 3)
+#else
+#if defined(ENABLE_FULL_TESTS)
+TEST_SCALESWAPXY1(DISABLED_, Scale, None, 0)
+TEST_SCALESWAPXY1(DISABLED_, Scale, Linear, 3)
+TEST_SCALESWAPXY1(DISABLED_, Scale, Bilinear, 3)
+TEST_SCALESWAPXY1(DISABLED_, Scale, Box, 3)
#else
- ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
+TEST_SCALESWAPXY1(DISABLED_, Scale, Bilinear, 3)
+TEST_SCALESWAPXY1(DISABLED_, Scale, Box, 3)
#endif
- }
-
- for (int i = 0; i < 1280; ++i) {
- EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
- }
-
- EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4);
- EXPECT_EQ(dst_pixels_c[1279], 3839);
-}
-
-// Test scaling plane with 8 bit C vs 16 bit C and return maximum pixel
-// difference.
-// 0 = exact.
-static int TestPlaneFilter_16(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- FilterMode f,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
- if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
- return 0;
- }
-
- int i;
- int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
- int src_stride_y = Abs(src_width);
- int dst_y_plane_size = dst_width * dst_height;
- int dst_stride_y = dst_width;
-
- align_buffer_page_end(src_y, src_y_plane_size);
- align_buffer_page_end(src_y_16, src_y_plane_size * 2);
- align_buffer_page_end(dst_y_8, dst_y_plane_size);
- align_buffer_page_end(dst_y_16, dst_y_plane_size * 2);
- uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16);
- uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16);
-
- MemRandomize(src_y, src_y_plane_size);
- memset(dst_y_8, 0, dst_y_plane_size);
- memset(dst_y_16, 1, dst_y_plane_size * 2);
-
- for (i = 0; i < src_y_plane_size; ++i) {
- p_src_y_16[i] = src_y[i] & 255;
- }
-
- MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
- ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y_8, dst_stride_y,
- dst_width, dst_height, f);
- MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
-
- for (i = 0; i < benchmark_iterations; ++i) {
- ScalePlane_16(p_src_y_16, src_stride_y, src_width, src_height, p_dst_y_16,
- dst_stride_y, dst_width, dst_height, f);
- }
-
- // Expect an exact match.
- int max_diff = 0;
- for (i = 0; i < dst_y_plane_size; ++i) {
- int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]);
- if (abs_diff > max_diff) {
- max_diff = abs_diff;
- }
- }
-
- free_aligned_buffer_page_end(dst_y_8);
- free_aligned_buffer_page_end(dst_y_16);
- free_aligned_buffer_page_end(src_y);
- free_aligned_buffer_page_end(src_y_16);
-
- return max_diff;
-}
-
-// The following adjustments in dimensions ensure the scale factor will be
-// exactly achieved.
-// 2 is chroma subsample.
-#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
-#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
-
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
- TEST_F(LibYUVScaleTest, ScalePlaneDownBy##name##_##filter##_16) { \
- int diff = TestPlaneFilter_16( \
- SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
- DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
- EXPECT_LE(diff, max_diff); \
- }
-
-// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
-// filtering is different fixed point implementations for SSSE3, Neon and C.
-#define TEST_FACTOR(name, nom, denom, boxdiff) \
- TEST_FACTOR1(name, None, nom, denom, 0) \
- TEST_FACTOR1(name, Linear, nom, denom, boxdiff) \
- TEST_FACTOR1(name, Bilinear, nom, denom, boxdiff) \
- TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+#endif
+#undef TEST_SCALESWAPXY1
-TEST_FACTOR(2, 1, 2, 0)
-TEST_FACTOR(4, 1, 4, 0)
-TEST_FACTOR(8, 1, 8, 0)
-TEST_FACTOR(3by4, 3, 4, 1)
-TEST_FACTOR(3by8, 3, 8, 1)
-TEST_FACTOR(3, 1, 3, 0)
-#undef TEST_FACTOR1
-#undef TEST_FACTOR
-#undef SX
-#undef DX
} // namespace libyuv
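The DISABLED_ macro parameter added above relies on a GoogleTest naming
convention: tests whose names begin with "DISABLED_" are compiled but skipped
unless --gtest_also_run_disabled_tests is passed. A minimal sketch of the same
prefix-pasting trick (not part of the patch; macro and test names are
hypothetical):

#include <gtest/gtest.h>

#define SCALE_SKETCH_TEST(prefix, name) TEST(ScaleSketch, prefix##name)

SCALE_SKETCH_TEST(, RunsByDefault) { EXPECT_TRUE(true); }
SCALE_SKETCH_TEST(DISABLED_, SkippedByDefault) { EXPECT_TRUE(true); }
// Run skipped tests with: ./libyuv_unittest --gtest_also_run_disabled_tests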
diff --git a/unit_test/scale_uv_test.cc b/unit_test/scale_uv_test.cc
new file mode 100644
index 00000000..dab217c9
--- /dev/null
+++ b/unit_test/scale_uv_test.cc
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale_uv.h"
+
+namespace libyuv {
+
+#define STRINGIZE(line) #line
+#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+
+#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
+// SLOW TESTS are those that exercise unoptimized C code.
+// FULL TESTS are optimized but test many variations of the same code.
+#define ENABLE_FULL_TESTS
+#endif
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int UVTestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i;
+ int64_t src_uv_plane_size = Abs(src_width) * Abs(src_height) * 2LL;
+ int src_stride_uv = Abs(src_width) * 2;
+ int64_t dst_uv_plane_size = dst_width * dst_height * 2LL;
+ int dst_stride_uv = dst_width * 2;
+
+ align_buffer_page_end(src_uv, src_uv_plane_size);
+ align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_uv_opt, dst_uv_plane_size);
+
+ if (!src_uv || !dst_uv_c || !dst_uv_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_uv, src_uv_plane_size);
+ memset(dst_uv_c, 2, dst_uv_plane_size);
+ memset(dst_uv_opt, 123, dst_uv_plane_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ UVScale(src_uv, src_stride_uv, src_width, src_height, dst_uv_c, dst_stride_uv,
+ dst_width, dst_height, f);
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ UVScale(src_uv, src_stride_uv, src_width, src_height, dst_uv_opt,
+ dst_stride_uv, dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ int max_diff = 0;
+ for (i = 0; i < dst_uv_plane_size; ++i) {
+ int abs_diff = Abs(dst_uv_c[i] - dst_uv_opt[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_uv_c);
+ free_aligned_buffer_page_end(dst_uv_opt);
+ free_aligned_buffer_page_end(src_uv);
+ return max_diff;
+}
+
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
+#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
+
+#define TEST_FACTOR1(name, filter, nom, denom) \
+ TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) { \
+ int diff = UVTestFilter( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_EQ(0, diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test a scale factor with all 4 filters. Expect exact for SIMD vs C.
+#define TEST_FACTOR(name, nom, denom) \
+ TEST_FACTOR1(name, None, nom, denom) \
+ TEST_FACTOR1(name, Linear, nom, denom) \
+ TEST_FACTOR1(name, Bilinear, nom, denom) \
+ TEST_FACTOR1(name, Box, nom, denom)
+#else
+// Test a scale factor with Bilinear.
+#define TEST_FACTOR(name, nom, denom) TEST_FACTOR1(name, Bilinear, nom, denom)
+#endif
+
+TEST_FACTOR(2, 1, 2)
+TEST_FACTOR(4, 1, 4)
+// TEST_FACTOR(8, 1, 8) Disabled to keep benchmark time reasonable.
+TEST_FACTOR(3by4, 3, 4)
+TEST_FACTOR(3by8, 3, 8)
+TEST_FACTOR(3, 1, 3)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
+#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
+ int diff = UVTestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
+ int diff = UVTestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test scale to a specified size with all 3 filters.
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, None, 0) \
+ TEST_SCALETO1(name, width, height, Linear, 3) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3)
+#else
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3)
+#endif
+
+TEST_SCALETO(UVScale, 1, 1)
+TEST_SCALETO(UVScale, 569, 480)
+TEST_SCALETO(UVScale, 640, 360)
+#ifndef DISABLE_SLOW_TESTS
+TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */
+TEST_SCALETO(UVScale, 320, 240)
+TEST_SCALETO(UVScale, 1280, 720)
+TEST_SCALETO(UVScale, 1920, 1080)
+#endif // DISABLE_SLOW_TESTS
+#undef TEST_SCALETO1
+#undef TEST_SCALETO
+
+#define TEST_SCALESWAPXY1(name, filter, max_diff) \
+ TEST_F(LibYUVScaleTest, name##SwapXY_##filter) { \
+ int diff = \
+ UVTestFilter(benchmark_width_, benchmark_height_, benchmark_height_, \
+ benchmark_width_, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+#if defined(ENABLE_FULL_TESTS)
+// Test scale with swapped width and height with all 3 filters.
+TEST_SCALESWAPXY1(UVScale, None, 0)
+TEST_SCALESWAPXY1(UVScale, Linear, 0)
+TEST_SCALESWAPXY1(UVScale, Bilinear, 0)
+#else
+TEST_SCALESWAPXY1(UVScale, Bilinear, 0)
+#endif
+#undef TEST_SCALESWAPXY1
+
+TEST_F(LibYUVScaleTest, UVTest3x) {
+ const int kSrcStride = 480 * 2;
+ const int kDstStride = 160 * 2;
+ const int kSize = kSrcStride * 3;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 480 * 3; ++i) {
+ orig_pixels[i * 2 + 0] = i;
+ orig_pixels[i * 2 + 1] = 255 - i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ UVScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(225, dest_pixels[0]);
+ EXPECT_EQ(255 - 225, dest_pixels[1]);
+
+ UVScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
+ kFilterNone);
+
+ EXPECT_EQ(225, dest_pixels[0]);
+ EXPECT_EQ(255 - 225, dest_pixels[1]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVScaleTest, UVTest4x) {
+ const int kSrcStride = 640 * 2;
+ const int kDstStride = 160 * 2;
+ const int kSize = kSrcStride * 4;
+ align_buffer_page_end(orig_pixels, kSize);
+ for (int i = 0; i < 640 * 4; ++i) {
+ orig_pixels[i * 2 + 0] = i;
+ orig_pixels[i * 2 + 1] = 255 - i;
+ }
+ align_buffer_page_end(dest_pixels, kDstStride);
+
+ int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 *
+ benchmark_iterations_;
+ for (int i = 0; i < iterations160; ++i) {
+ UVScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
+ kFilterBilinear);
+ }
+
+ EXPECT_EQ(66, dest_pixels[0]);
+ EXPECT_EQ(190, dest_pixels[1]);
+
+ UVScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
+ kFilterNone);
+
+ EXPECT_EQ(2, dest_pixels[0]); // expect the 3rd pixel of the 3rd row
+ EXPECT_EQ(255 - 2, dest_pixels[1]);
+
+ free_aligned_buffer_page_end(dest_pixels);
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+} // namespace libyuv
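UVScale, used throughout this file, scales an interleaved UV (NV12/NV21
chroma) plane and is declared in libyuv/scale_uv.h. A minimal usage sketch
(not part of the patch; sizes and helper name are hypothetical; width and
height count UV pairs, so strides are 2 bytes per pair):

#include <cstdint>
#include <vector>

#include "libyuv/scale.h"  // FilterMode
#include "libyuv/scale_uv.h"

// Halve a tightly packed 320x180 interleaved UV plane.
std::vector<uint8_t> HalveUV(const std::vector<uint8_t>& src_uv) {
  const int sw = 320, sh = 180, dw = 160, dh = 90;
  std::vector<uint8_t> dst(static_cast<size_t>(dw) * dh * 2);
  libyuv::UVScale(src_uv.data(), sw * 2, sw, sh, dst.data(), dw * 2, dw, dh,
                  libyuv::kFilterBilinear);
  return dst;
}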
diff --git a/files/unit_test/testdata/arm_v7.txt b/unit_test/testdata/arm_v7.txt
index 5d7dbd04..5d7dbd04 100644
--- a/files/unit_test/testdata/arm_v7.txt
+++ b/unit_test/testdata/arm_v7.txt
diff --git a/files/unit_test/testdata/juno.txt b/unit_test/testdata/juno.txt
index dd465272..dd465272 100644
--- a/files/unit_test/testdata/juno.txt
+++ b/unit_test/testdata/juno.txt
diff --git a/unit_test/testdata/mips.txt b/unit_test/testdata/mips.txt
new file mode 100644
index 00000000..d9f28cbf
--- /dev/null
+++ b/unit_test/testdata/mips.txt
@@ -0,0 +1,7 @@
+system type : generic-loongson-machine
+machine : loongson,generic
+processor : 0
+
+isa : mips1 mips2 mips3 mips4 mips5 mips32r1 mips32r2 mips64r1 mips64r2
+ASEs implemented : vz
+shadow register sets : 1
diff --git a/unit_test/testdata/mips_loongson2k.txt b/unit_test/testdata/mips_loongson2k.txt
new file mode 100644
index 00000000..8a88d38f
--- /dev/null
+++ b/unit_test/testdata/mips_loongson2k.txt
@@ -0,0 +1,5 @@
+system type : Loongson2K-SBC
+machine : loongson,LS2k1000-EVP
+processor : 0
+cpu model : Loongson-2K V0.3 FPU V0.1
+BogoMIPS : 1980.41
diff --git a/unit_test/testdata/mips_loongson3.txt b/unit_test/testdata/mips_loongson3.txt
new file mode 100644
index 00000000..1f540b12
--- /dev/null
+++ b/unit_test/testdata/mips_loongson3.txt
@@ -0,0 +1,10 @@
+system type : generic-loongson-machine
+machine : Unknown
+processor : 0
+cpu model : ICT Loongson-3 V0.9 FPU V0.1
+model name : ICT Loongson-3A R3 (Loongson-3A3000) @ 1500MHz
+BogoMIPS : 2990.15
+
+isa : mips1 mips2 mips3 mips4 mips5 mips32r1 mips32r2 mips64r1 mips64r2
+ASEs implemented : dsp dsp2 vz
+shadow register sets : 1
diff --git a/unit_test/testdata/mips_loongson_mmi.txt b/unit_test/testdata/mips_loongson_mmi.txt
new file mode 100644
index 00000000..0f10b8bb
--- /dev/null
+++ b/unit_test/testdata/mips_loongson_mmi.txt
@@ -0,0 +1,7 @@
+system type : generic-loongson-machine
+machine : loongson,generic
+processor : 0
+
+isa : mips1 mips2 mips3 mips4 mips5 mips32r1 mips32r2 mips64r1 mips64r2
+ASEs implemented : vz loongson-mmi loongson-ext
+shadow register sets : 1
diff --git a/unit_test/testdata/mips_msa.txt b/unit_test/testdata/mips_msa.txt
new file mode 100644
index 00000000..ac930615
--- /dev/null
+++ b/unit_test/testdata/mips_msa.txt
@@ -0,0 +1,7 @@
+system type : generic-loongson-machine
+machine : loongson,generic
+processor : 0
+
+isa : mips1 mips2 mips3 mips4 mips5 mips32r1 mips32r2 mips64r1 mips64r2
+ASEs implemented : vz msa
+shadow register sets : 1
diff --git a/unit_test/testdata/riscv64.txt b/unit_test/testdata/riscv64.txt
new file mode 100644
index 00000000..fbb4200f
--- /dev/null
+++ b/unit_test/testdata/riscv64.txt
@@ -0,0 +1,4 @@
+processor : 0
+hart : 1
+isa : rv64imac
+mmu : sv48
\ No newline at end of file
diff --git a/unit_test/testdata/riscv64_rvv.txt b/unit_test/testdata/riscv64_rvv.txt
new file mode 100644
index 00000000..af1b3f36
--- /dev/null
+++ b/unit_test/testdata/riscv64_rvv.txt
@@ -0,0 +1,4 @@
+processor : 0
+hart : 1
+isa : rv64imafdcv
+mmu : sv48
\ No newline at end of file
diff --git a/unit_test/testdata/riscv64_rvv_zvfh.txt b/unit_test/testdata/riscv64_rvv_zvfh.txt
new file mode 100644
index 00000000..c416c1af
--- /dev/null
+++ b/unit_test/testdata/riscv64_rvv_zvfh.txt
@@ -0,0 +1,4 @@
+processor : 0
+hart : 1
+isa : rv64imafdcv_zfh_zvfh
+mmu : sv48
\ No newline at end of file
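
These testdata files are canned /proc/cpuinfo captures for the CPU-detection tests: MIPS feature detection keys off the "ASEs implemented" line (msa, loongson-mmi, loongson-ext) and RISC-V detection off the "isa" string (the trailing v / _zvfh extensions). A minimal sketch of the parsing idea, not libyuv's internal helper (which lives in cpu_id.cc):

    #include <stdio.h>
    #include <string.h>

    // Return 1 if the capture's "ASEs implemented" line mentions `feature`.
    // Simplified: plain substring match, so "msa" would also match "xmsa".
    static int CpuInfoHasAse(const char* path, const char* feature) {
      char line[512];
      int found = 0;
      FILE* f = fopen(path, "r");
      if (!f) return 0;
      while (fgets(line, sizeof(line), f)) {
        if (!strncmp(line, "ASEs implemented", 16) && strstr(line, feature)) {
          found = 1;
          break;
        }
      }
      fclose(f);
      return found;
    }
    // CpuInfoHasAse("unit_test/testdata/mips_msa.txt", "msa") -> 1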
diff --git a/files/unit_test/testdata/tegra3.txt b/unit_test/testdata/tegra3.txt
index d1b09f6b..d1b09f6b 100644
--- a/files/unit_test/testdata/tegra3.txt
+++ b/unit_test/testdata/tegra3.txt
diff --git a/files/unit_test/testdata/test0.jpg b/unit_test/testdata/test0.jpg
index f4461a81..f4461a81 100644
--- a/files/unit_test/testdata/test0.jpg
+++ b/unit_test/testdata/test0.jpg
Binary files differ
diff --git a/files/unit_test/testdata/test1.jpg b/unit_test/testdata/test1.jpg
index a0210e9d..a0210e9d 100644
--- a/files/unit_test/testdata/test1.jpg
+++ b/unit_test/testdata/test1.jpg
Binary files differ
diff --git a/files/unit_test/testdata/test2.jpg b/unit_test/testdata/test2.jpg
index 816ca767..816ca767 100644
--- a/files/unit_test/testdata/test2.jpg
+++ b/unit_test/testdata/test2.jpg
Binary files differ
diff --git a/files/unit_test/testdata/test3.jpg b/unit_test/testdata/test3.jpg
index 792d91dc..792d91dc 100644
--- a/files/unit_test/testdata/test3.jpg
+++ b/unit_test/testdata/test3.jpg
Binary files differ
diff --git a/files/unit_test/testdata/test4.jpg b/unit_test/testdata/test4.jpg
index 1ef41668..1ef41668 100644
--- a/files/unit_test/testdata/test4.jpg
+++ b/unit_test/testdata/test4.jpg
Binary files differ
diff --git a/files/unit_test/unit_test.cc b/unit_test/unit_test.cc
index a1ae7ea3..239d5b92 100644
--- a/files/unit_test/unit_test.cc
+++ b/unit_test/unit_test.cc
@@ -14,23 +14,28 @@
#include <cstring>
-#ifdef LIBYUV_USE_GFLAGS
-#include "gflags/gflags.h"
+#ifdef LIBYUV_USE_ABSL_FLAGS
+#include "absl/flags/flag.h"
+#include "absl/flags/parse.h"
#endif
#include "libyuv/cpu_id.h"
unsigned int fastrand_seed = 0xfb;
-#ifdef LIBYUV_USE_GFLAGS
-DEFINE_int32(libyuv_width, 0, "width of test image.");
-DEFINE_int32(libyuv_height, 0, "height of test image.");
-DEFINE_int32(libyuv_repeat, 0, "number of times to repeat test.");
-DEFINE_int32(libyuv_flags, 0, "cpu flags for reference code. 1 = C, -1 = SIMD");
-DEFINE_int32(libyuv_cpu_info,
- 0,
- "cpu flags for benchmark code. 1 = C, -1 = SIMD");
+#ifdef LIBYUV_USE_ABSL_FLAGS
+ABSL_FLAG(int32_t, libyuv_width, 0, "width of test image.");
+ABSL_FLAG(int32_t, libyuv_height, 0, "height of test image.");
+ABSL_FLAG(int32_t, libyuv_repeat, 0, "number of times to repeat test.");
+ABSL_FLAG(int32_t,
+ libyuv_flags,
+ 0,
+ "cpu flags for reference code. 1 = C, -1 = SIMD");
+ABSL_FLAG(int32_t,
+ libyuv_cpu_info,
+ 0,
+ "cpu flags for benchmark code. 1 = C, -1 = SIMD");
#else
-// Disable command line parameters if gflags disabled.
+// Disable command line parameters if absl/flags is disabled.
static const int32_t FLAGS_libyuv_width = 0;
static const int32_t FLAGS_libyuv_height = 0;
static const int32_t FLAGS_libyuv_repeat = 0;
@@ -38,6 +43,12 @@ static const int32_t FLAGS_libyuv_flags = 0;
static const int32_t FLAGS_libyuv_cpu_info = 0;
#endif
+#ifdef LIBYUV_USE_ABSL_FLAGS
+#define LIBYUV_GET_FLAG(f) absl::GetFlag(f)
+#else
+#define LIBYUV_GET_FLAG(f) f
+#endif
+
// Test environment variable for disabling CPU features. Any non-zero value
// to disable. Zero ignored to make it easy to set the variable on/off.
#if !defined(__native_client__) && !defined(_M_ARM)
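
The LIBYUV_GET_FLAG shim lets the many call sites below stay identical in both configurations: with LIBYUV_USE_ABSL_FLAGS, ABSL_FLAG(int32_t, name, ...) defines an absl::Flag<int32_t> object named FLAGS_name that must be read via absl::GetFlag(); without it, FLAGS_name is a plain const int32_t read directly. The pattern in isolation, with the includes shown above and a hypothetical flag name:

    #ifdef LIBYUV_USE_ABSL_FLAGS
    ABSL_FLAG(int32_t, example, 0, "an example flag");
    #define GET_FLAG(f) absl::GetFlag(f)
    #else
    static const int32_t FLAGS_example = 0;
    #define GET_FLAG(f) f
    #endif
    // Either way a call site reads: int32_t v = GET_FLAG(FLAGS_example);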
@@ -66,8 +77,20 @@ int TestCpuEnv(int cpu_info) {
if (TestEnv("LIBYUV_DISABLE_MSA")) {
cpu_info &= ~libyuv::kCpuHasMSA;
}
- if (TestEnv("LIBYUV_DISABLE_MMI")) {
- cpu_info &= ~libyuv::kCpuHasMMI;
+#endif
+#if defined(__longarch__) && defined(__linux__)
+ if (TestEnv("LIBYUV_DISABLE_LSX")) {
+ cpu_info &= ~libyuv::kCpuHasLSX;
+ }
+#endif
+#if defined(__longarch__) && defined(__linux__)
+ if (TestEnv("LIBYUV_DISABLE_LASX")) {
+ cpu_info &= ~libyuv::kCpuHasLASX;
+ }
+#endif
+#if defined(__riscv) && defined(__linux__)
+ if (TestEnv("LIBYUV_DISABLE_RVV")) {
+ cpu_info &= ~libyuv::kCpuHasRVV;
}
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
@@ -109,6 +132,9 @@ int TestCpuEnv(int cpu_info) {
if (TestEnv("LIBYUV_DISABLE_AVX512VL")) {
cpu_info &= ~libyuv::kCpuHasAVX512VL;
}
+ if (TestEnv("LIBYUV_DISABLE_AVX512VNNI")) {
+ cpu_info &= ~libyuv::kCpuHasAVX512VNNI;
+ }
if (TestEnv("LIBYUV_DISABLE_AVX512VBMI")) {
cpu_info &= ~libyuv::kCpuHasAVX512VBMI;
}
@@ -118,11 +144,14 @@ int TestCpuEnv(int cpu_info) {
if (TestEnv("LIBYUV_DISABLE_AVX512VBITALG")) {
cpu_info &= ~libyuv::kCpuHasAVX512VBITALG;
}
- if (TestEnv("LIBYUV_DISABLE_AVX512VPOPCNTDQ")) {
- cpu_info &= ~libyuv::kCpuHasAVX512VPOPCNTDQ;
+ if (TestEnv("LIBYUV_DISABLE_AVX10")) {
+ cpu_info &= ~libyuv::kCpuHasAVX10;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVXVNNI")) {
+ cpu_info &= ~libyuv::kCpuHasAVXVNNI;
}
- if (TestEnv("LIBYUV_DISABLE_GFNI")) {
- cpu_info &= ~libyuv::kCpuHasGFNI;
+ if (TestEnv("LIBYUV_DISABLE_AVXVNNIINT8")) {
+ cpu_info &= ~libyuv::kCpuHasAVXVNNIINT8;
}
#endif
if (TestEnv("LIBYUV_DISABLE_ASM")) {
@@ -145,8 +174,8 @@ LibYUVConvertTest::LibYUVConvertTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -156,29 +185,29 @@ LibYUVConvertTest::LibYUVConvertTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -201,8 +230,8 @@ LibYUVColorTest::LibYUVColorTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -212,29 +241,29 @@ LibYUVColorTest::LibYUVColorTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -257,8 +286,8 @@ LibYUVScaleTest::LibYUVScaleTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -268,29 +297,29 @@ LibYUVScaleTest::LibYUVScaleTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -313,8 +342,8 @@ LibYUVRotateTest::LibYUVRotateTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -324,29 +353,29 @@ LibYUVRotateTest::LibYUVRotateTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -369,8 +398,8 @@ LibYUVPlanarTest::LibYUVPlanarTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -380,29 +409,29 @@ LibYUVPlanarTest::LibYUVPlanarTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -425,8 +454,8 @@ LibYUVBaseTest::LibYUVBaseTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -436,29 +465,29 @@ LibYUVBaseTest::LibYUVBaseTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -481,8 +510,8 @@ LibYUVCompareTest::LibYUVCompareTest()
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
}
- if (FLAGS_libyuv_repeat) {
- benchmark_iterations_ = FLAGS_libyuv_repeat;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) {
+ benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat);
}
if (benchmark_iterations_ > 1) {
benchmark_width_ = 1280;
@@ -492,29 +521,29 @@ LibYUVCompareTest::LibYUVCompareTest()
if (width) {
benchmark_width_ = atoi(width); // NOLINT
}
- if (FLAGS_libyuv_width) {
- benchmark_width_ = FLAGS_libyuv_width;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) {
+ benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width);
}
const char* height = getenv("LIBYUV_HEIGHT");
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- if (FLAGS_libyuv_height) {
- benchmark_height_ = FLAGS_libyuv_height;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) {
+ benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height);
}
const char* cpu_flags = getenv("LIBYUV_FLAGS");
if (cpu_flags) {
disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
}
- if (FLAGS_libyuv_flags) {
- disable_cpu_flags_ = FLAGS_libyuv_flags;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) {
+ disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags);
}
const char* cpu_info = getenv("LIBYUV_CPU_INFO");
if (cpu_info) {
benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
}
- if (FLAGS_libyuv_cpu_info) {
- benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) {
+ benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info);
}
disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
@@ -529,11 +558,8 @@ LibYUVCompareTest::LibYUVCompareTest()
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
-#ifdef LIBYUV_USE_GFLAGS
- // AllowCommandLineParsing allows us to ignore flags passed on to us by
- // Chromium build bots without having to explicitly disable them.
- google::AllowCommandLineReparsing();
- google::ParseCommandLineFlags(&argc, &argv, true);
+#ifdef LIBYUV_USE_ABSL_FLAGS
+ absl::ParseCommandLine(argc, argv);
#endif
return RUN_ALL_TESTS();
}
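
With this wiring, benchmark dimensions can be set either through environment variables (LIBYUV_WIDTH, LIBYUV_HEIGHT, LIBYUV_REPEAT, LIBYUV_FLAGS, LIBYUV_CPU_INFO) or, when built with LIBYUV_USE_ABSL_FLAGS, through the corresponding flags, e.g. --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=10 (the binary name varies per build); flags take precedence because each constructor applies them after its getenv() checks.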
diff --git a/files/unit_test/unit_test.h b/unit_test/unit_test.h
index 87907fa1..99cc8d19 100644
--- a/files/unit_test/unit_test.h
+++ b/unit_test/unit_test.h
@@ -11,10 +11,10 @@
#ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT
#define UNIT_TEST_UNIT_TEST_H_
-#ifdef WIN32
+#include <stddef.h> // For NULL
+#ifdef _WIN32
#include <windows.h>
#else
-#include <sys/resource.h>
#include <sys/time.h>
#endif
@@ -77,7 +77,18 @@ static inline bool SizeValid(int src_width,
#define free_aligned_buffer_page_end(var) \
free(var##_mem); \
- var = 0
+ var = NULL
+
+#define align_buffer_page_end_16(var, size) \
+ uint8_t* var##_mem = \
+ reinterpret_cast<uint8_t*>(malloc(((size)*2 + 4095 + 63) & ~4095)); \
+ uint16_t* var = reinterpret_cast<uint16_t*>( \
+ (intptr_t)(var##_mem + (((size)*2 + 4095 + 63) & ~4095) - (size)*2) & \
+ ~63)
+
+#define free_aligned_buffer_page_end_16(var) \
+ free(var##_mem); \
+ var = NULL
#ifdef WIN32
static inline double get_time() {
@@ -111,10 +122,13 @@ inline int fastrand() {
return static_cast<int>((fastrand_seed >> 16) & 0xffff);
}
+// ubsan reports unaligned uint16_t stores, so write the random bytes one
+// uint8_t at a time.
static inline void MemRandomize(uint8_t* dst, int64_t len) {
int64_t i;
for (i = 0; i < len - 1; i += 2) {
- *reinterpret_cast<uint16_t*>(dst) = fastrand();
+ int r = fastrand();
+ dst[0] = static_cast<uint8_t>(r);
+ dst[1] = static_cast<uint8_t>(r >> 8);
dst += 2;
}
for (; i < len; ++i) {
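
The align_buffer_page_end_16 macro above applies the suite's page-end trick to uint16_t buffers: the allocation is rounded up to a 4096-byte multiple and the usable region is placed flush against the allocation's end (then aligned down to 64 bytes, which the "+ 63" headroom allows), so overruns past the buffer immediately run off the allocation and are caught by memory tools. The same arithmetic for the 8-bit case, as a sketch:

    #include <stdint.h>
    #include <stdlib.h>

    // Returns a `size`-byte buffer ending at (or within 63 bytes of) the end
    // of a page-multiple allocation; *mem_out is the pointer to free() later.
    static uint8_t* AllocPageEnd(size_t size, uint8_t** mem_out) {
      size_t rounded = (size + 4095 + 63) & ~(size_t)4095;
      uint8_t* mem = (uint8_t*)malloc(rounded);
      *mem_out = mem;
      return (uint8_t*)((uintptr_t)(mem + rounded - size) & ~(uintptr_t)63);
    }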
diff --git a/files/unit_test/video_common_test.cc b/unit_test/video_common_test.cc
index a84206a2..36728ea9 100644
--- a/files/unit_test/video_common_test.cc
+++ b/unit_test/video_common_test.cc
@@ -29,7 +29,7 @@ static bool TestValidFourCC(uint32_t fourcc, int bpp) {
!TestValidChar((fourcc >> 24) & 0xff)) {
return false;
}
- if (bpp < 0 || bpp > 32) {
+ if (bpp < 0 || bpp > 64) {
return false;
}
return true;
@@ -65,13 +65,15 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
- EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420)); // deprecated.
EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420)); // deprecated.
EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
EXPECT_TRUE(TestValidFourCC(FOURCC_AR30, FOURCC_BPP_AR30));
EXPECT_TRUE(TestValidFourCC(FOURCC_AB30, FOURCC_BPP_AB30));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_AR64, FOURCC_BPP_AR64));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_AB64, FOURCC_BPP_AB64));
EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG));
EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
@@ -81,6 +83,11 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_H420, FOURCC_BPP_H420));
EXPECT_TRUE(TestValidFourCC(FOURCC_H422, FOURCC_BPP_H422));
EXPECT_TRUE(TestValidFourCC(FOURCC_H010, FOURCC_BPP_H010));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_H210, FOURCC_BPP_H210));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_I010, FOURCC_BPP_I010));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_I210, FOURCC_BPP_I210));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_P010, FOURCC_BPP_P010));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_P210, FOURCC_BPP_P210));
EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));
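
The bpp ceiling in TestValidFourCC rises from 32 to 64 because the new AR64/AB64 FourCCs carry 16 bits per channel, i.e. 64 bits per ARGB pixel.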
diff --git a/files/util/Makefile b/util/Makefile
index 40e74b65..40e74b65 100644
--- a/files/util/Makefile
+++ b/util/Makefile
diff --git a/util/color.cc b/util/color.cc
new file mode 100644
index 00000000..8c3bbefd
--- /dev/null
+++ b/util/color.cc
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2021 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// This utility computes values needed to generate yuvconstants based on
+// white point values.
+// The yuv formulas are tuned for 8 bit YUV channels.
+
+// For those MCs that can be represented as kr and kb:
+// Full range
+// float M[3][3]
+// {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
+// float B[3]
+// {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
+// Limited range
+// float M[3][3]
+// {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
+// float B[3]
+// {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
+
+// mc bt
+// 1 bt.709 KR = 0.2126; KB = 0.0722
+// 4 fcc KR = 0.30; KB = 0.11
+// 6 bt.601 KR = 0.299; KB = 0.114
+// 7 SMPTE 240M KR = 0.212; KB = 0.087
+// 10 bt2020 KR = 0.2627; KB = 0.0593
+
+// BT.709 full range YUV to RGB reference
+// R = Y + V * 1.5748
+// G = Y - U * 0.18732 - V * 0.46812
+// B = Y + U * 1.8556
+// KR = 0.2126
+// KB = 0.0722
+
+// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/
+
+// // Y contribution to R,G,B. Scale and bias.
+// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+// #define YB 32 /* 64 / 2 */
+//
+// // U and V contributions to R,G,B.
+// #define UB 113 /* round(1.77200 * 64) */
+// #define UG 22 /* round(0.34414 * 64) */
+// #define VG 46 /* round(0.71414 * 64) */
+// #define VR 90 /* round(1.40200 * 64) */
+//
+// // Bias values to round, and subtract 128 from U and V.
+// #define BB (-UB * 128 + YB)
+// #define BG (UG * 128 + VG * 128 + YB)
+// #define BR (-VR * 128 + YB)
+
+int round(float v) {
+ return (int)(v + 0.5);
+}
+
+int main(int argc, const char* argv[]) {
+ if (argc < 2) {
+ printf("color kr kb\n");
+ return -1;
+ }
+ float kr = atof(argv[1]);
+ float kb = atof(argv[2]);
+ float kg = 1 - kr - kb;
+
+ float vr = 2 * (1 - kr);
+ float ug = 2 * ((1 - kb) * kb / kg);
+ float vg = 2 * ((1 - kr) * kr / kg);
+ float ub = 2 * (1 - kb);
+
+ printf("Full range\n");
+ printf("R = Y + V * %5f\n", vr);
+ printf("G = Y - U * %6f - V * %6f\n", ug, vg);
+ printf("B = Y + U * %5f\n", ub);
+
+ printf("KR = %4f; ", kr);
+ printf("KB = %4f\n", kb);
+ // printf("KG = %4f\n", kg);
+ // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+ // #define YB 32 /* 64 / 2 */
+ //
+ // // U and V contributions to R,G,B.
+
+ printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
+ printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
+ printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
+ printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
+
+ vr = 255.f / 224.f * 2 * (1 - kr);
+ ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
+ vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg);
+ ub = 255.f / 224.f * 2 * (1 - kb);
+
+ printf("Limited range\n");
+ printf("R = (Y - 16) * 1.164 + V * %5f\n", vr);
+ printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
+ printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
+
+ // printf("KG = %4f\n", kg);
+ // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+ // #define YB 32 /* 64 / 2 */
+ //
+ // // U and V contributions to R,G,B.
+
+ printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
+ printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
+ printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
+ printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
+
+ return 0;
+}
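
A worked check of the arithmetic (BT.601, kr = 0.299, kb = 0.114, so kg = 0.587), reproducing the UB/UG/VG/VR values quoted in the header comment:

    // vr = 2*(1 - 0.299)             = 1.40200 -> round(64 * vr) = 90
    // ub = 2*(1 - 0.114)             = 1.77200 -> round(64 * ub) = 113
    // ug = 2*(1 - 0.114)*0.114/0.587 = 0.34414 -> round(64 * ug) = 22
    // vg = 2*(1 - 0.299)*0.299/0.587 = 0.71414 -> round(64 * vg) = 46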
diff --git a/files/util/compare.cc b/util/compare.cc
index a16613ee..a16613ee 100644
--- a/files/util/compare.cc
+++ b/util/compare.cc
diff --git a/files/util/cpuid.c b/util/cpuid.c
index 84c06022..c07e6e95 100644
--- a/files/util/cpuid.c
+++ b/util/cpuid.c
@@ -12,16 +12,19 @@
#include <stdlib.h>
#include <string.h>
-#define INCLUDE_LIBYUV_COMPARE_H_
-#include "libyuv.h"
-#include "./psnr.h"
-#include "./ssim.h"
+#include "libyuv/cpu_id.h"
+
+#ifdef __cplusplus
+using namespace libyuv;
+#endif
int main(int argc, const char* argv[]) {
int cpu_flags = TestCpuFlag(-1);
int has_arm = TestCpuFlag(kCpuHasARM);
- int has_mips = TestCpuFlag(kCpuHasMIPS);
+ int has_riscv = TestCpuFlag(kCpuHasRISCV);
int has_x86 = TestCpuFlag(kCpuHasX86);
+ int has_mips = TestCpuFlag(kCpuHasMIPS);
+ int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
(void)argc;
(void)argv;
@@ -60,19 +63,28 @@ int main(int argc, const char* argv[]) {
model, model);
}
#endif
- printf("Cpu Flags %x\n", cpu_flags);
- printf("Has ARM %x\n", has_arm);
- printf("Has MIPS %x\n", has_mips);
- printf("Has X86 %x\n", has_x86);
+ printf("Cpu Flags 0x%x\n", cpu_flags);
if (has_arm) {
int has_neon = TestCpuFlag(kCpuHasNEON);
- printf("Has NEON %x\n", has_neon);
+ printf("Has ARM 0x%x\n", has_arm);
+ printf("Has NEON 0x%x\n", has_neon);
+ }
+ if (has_riscv) {
+ int has_rvv = TestCpuFlag(kCpuHasRVV);
+ printf("Has RISCV 0x%x\n", has_riscv);
+ printf("Has RVV 0x%x\n", has_rvv);
}
if (has_mips) {
int has_msa = TestCpuFlag(kCpuHasMSA);
- printf("Has MSA %x\n", has_msa);
- int has_mmi = TestCpuFlag(kCpuHasMMI);
- printf("Has MMI %x\n", has_mmi);
+ printf("Has MIPS 0x%x\n", has_mips);
+ printf("Has MSA 0x%x\n", has_msa);
+ }
+ if (has_loongarch) {
+ int has_lsx = TestCpuFlag(kCpuHasLSX);
+ int has_lasx = TestCpuFlag(kCpuHasLASX);
+ printf("Has LOONGARCH 0x%x\n", has_loongarch);
+ printf("Has LSX 0x%x\n", has_lsx);
+ printf("Has LASX 0x%x\n", has_lasx);
}
if (has_x86) {
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
@@ -83,30 +95,35 @@ int main(int argc, const char* argv[]) {
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_erms = TestCpuFlag(kCpuHasERMS);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
- int has_f16c = TestCpuFlag(kCpuHasF16C);
- int has_gfni = TestCpuFlag(kCpuHasGFNI);
+ int has_f16c = TestCpuFlag(kCpuHasF16C);
int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
+ int has_avx512vnni = TestCpuFlag(kCpuHasAVX512VNNI);
int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
- int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ);
- printf("Has SSE2 %x\n", has_sse2);
- printf("Has SSSE3 %x\n", has_ssse3);
- printf("Has SSE4.1 %x\n", has_sse41);
- printf("Has SSE4.2 %x\n", has_sse42);
- printf("Has AVX %x\n", has_avx);
- printf("Has AVX2 %x\n", has_avx2);
- printf("Has ERMS %x\n", has_erms);
- printf("Has FMA3 %x\n", has_fma3);
- printf("Has F16C %x\n", has_f16c);
- printf("Has GFNI %x\n", has_gfni);
- printf("Has AVX512BW %x\n", has_avx512bw);
- printf("Has AVX512VL %x\n", has_avx512vl);
- printf("Has AVX512VBMI %x\n", has_avx512vbmi);
- printf("Has AVX512VBMI2 %x\n", has_avx512vbmi2);
- printf("Has AVX512VBITALG %x\n", has_avx512vbitalg);
- printf("Has AVX512VPOPCNTDQ %x\n", has_avx512vpopcntdq);
+ int has_avx10 = TestCpuFlag(kCpuHasAVX10);
+ int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
+ int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
+ printf("Has X86 0x%x\n", has_x86);
+ printf("Has SSE2 0x%x\n", has_sse2);
+ printf("Has SSSE3 0x%x\n", has_ssse3);
+ printf("Has SSE4.1 0x%x\n", has_sse41);
+ printf("Has SSE4.2 0x%x\n", has_sse42);
+ printf("Has AVX 0x%x\n", has_avx);
+ printf("Has AVX2 0x%x\n", has_avx2);
+ printf("Has ERMS 0x%x\n", has_erms);
+ printf("Has FMA3 0x%x\n", has_fma3);
+ printf("Has F16C 0x%x\n", has_f16c);
+ printf("Has AVX512BW 0x%x\n", has_avx512bw);
+ printf("Has AVX512VL 0x%x\n", has_avx512vl);
+ printf("Has AVX512VNNI 0x%x\n", has_avx512vnni);
+ printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi);
+ printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2);
+ printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg);
+ printf("Has AVX10 0x%x\n", has_avx10);
+ printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
+ printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
}
return 0;
}
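
Beyond reporting, the same cpu_id API is what applications use for runtime dispatch, and MaskCpuFlags mirrors the "1 = C, -1 = SIMD" convention of the test flags above. A short sketch against the public header:

    #include "libyuv/cpu_id.h"

    bool HaveAVX2() {
      return libyuv::TestCpuFlag(libyuv::kCpuHasAVX2) != 0;
    }

    void ForceCPathsOnly() {
      libyuv::MaskCpuFlags(1);  // C only; MaskCpuFlags(-1) restores all SIMD.
    }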
diff --git a/util/i444tonv12_eg.cc b/util/i444tonv12_eg.cc
new file mode 100644
index 00000000..0fcb4095
--- /dev/null
+++ b/util/i444tonv12_eg.cc
@@ -0,0 +1,28 @@
+
+#include "libyuv/convert.h"
+
+#include <stdio.h> // for printf
+#include <string.h> // for memset
+
+int main(int, char**) {
+ unsigned char src_i444[640 * 400 * 3];
+ unsigned char dst_nv12[640 * 400 * 3 / 2];
+
+ for (size_t i = 0; i < sizeof(src_i444); ++i) {
+ src_i444[i] = i & 255;
+ }
+ memset(dst_nv12, 0, sizeof(dst_nv12));
+ libyuv::I444ToNV12(&src_i444[0], 640, // source Y
+ &src_i444[640 * 400], 640, // source U
+ &src_i444[640 * 400 * 2], 640, // source V
+ &dst_nv12[0], 640, // dest Y
+ &dst_nv12[640 * 400], 640, // dest UV
+ 640, 400); // width and height
+
+ int checksum = 0;
+ for (size_t i = 0; i < sizeof(dst_nv12); ++i) {
+ checksum += dst_nv12[i];
+ }
+ printf("checksum %x %s\n", checksum, checksum == 0x2ec0c00 ? "PASS" : "FAIL");
+ return 0;
+}
\ No newline at end of file
diff --git a/files/util/psnr.cc b/util/psnr.cc
index c7bee7f9..c7bee7f9 100644
--- a/files/util/psnr.cc
+++ b/util/psnr.cc
diff --git a/files/util/psnr.h b/util/psnr.h
index aac128cb..aac128cb 100644
--- a/files/util/psnr.h
+++ b/util/psnr.h
diff --git a/files/util/psnr_main.cc b/util/psnr_main.cc
index a930b202..8b9fd972 100644
--- a/files/util/psnr_main.cc
+++ b/util/psnr_main.cc
@@ -248,13 +248,13 @@ bool UpdateMetrics(uint8_t* ch_org,
int number_of_frames,
metric* cur_distortion_psnr,
metric* distorted_frame,
- bool do_psnr) {
+ bool compute_psnr) {
const int uv_offset = (do_swap_uv ? uv_size : 0);
const uint8_t* const u_org = ch_org + y_size + uv_offset;
const uint8_t* const u_rec = ch_rec + y_size;
const uint8_t* const v_org = ch_org + y_size + (uv_size - uv_offset);
const uint8_t* const v_rec = ch_rec + y_size + uv_size;
- if (do_psnr) {
+ if (compute_psnr) {
#ifdef HAVE_JPEG
double y_err = static_cast<double>(
libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
diff --git a/files/util/ssim.cc b/util/ssim.cc
index 096fbcf0..096fbcf0 100644
--- a/files/util/ssim.cc
+++ b/util/ssim.cc
diff --git a/files/util/ssim.h b/util/ssim.h
index a855f1d1..a855f1d1 100644
--- a/files/util/ssim.h
+++ b/util/ssim.h
diff --git a/util/yuvconstants.c b/util/yuvconstants.c
new file mode 100644
index 00000000..4e5185af
--- /dev/null
+++ b/util/yuvconstants.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2021 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// This utility computes values needed to generate yuvconstants based on
+// white point values.
+// The yuv formulas are tuned for 8 bit YUV channels.
+
+// See Also
+// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/
+
+// BT.709 full range YUV to RGB reference
+// R = Y + V * 1.5748
+// G = Y - U * 0.18732 - V * 0.46812
+// B = Y + U * 1.8556
+// KR = 0.2126
+// KB = 0.0722
+
+// // Y contribution to R,G,B. Scale and bias.
+// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+// #define YB 32 /* 64 / 2 */
+//
+// // U and V contributions to R,G,B.
+// #define UB 113 /* round(1.77200 * 64) */
+// #define UG 22 /* round(0.34414 * 64) */
+// #define VG 46 /* round(0.71414 * 64) */
+// #define VR 90 /* round(1.40200 * 64) */
+//
+// // Bias values to round, and subtract 128 from U and V.
+// #define BB (-UB * 128 + YB)
+// #define BG (UG * 128 + VG * 128 + YB)
+// #define BR (-VR * 128 + YB)
+
+int main(int argc, const char* argv[]) {
+ if (argc < 3) {
+ printf("yuvconstants [KR] [KB]\n");
+ printf(" e.g. yuvconstants 0.2126 0.0722\n");
+ printf(" MC BT KR KB\n");
+ printf(" 1 BT.709 KR = 0.2126; KB = 0.0722\n");
+ printf(" 4 FCC KR = 0.30; KB = 0.11\n");
+ printf(" 6 BT.601 KR = 0.299; KB = 0.114\n");
+ printf(" 7 SMPTE 240M KR = 0.212; KB = 0.087\n");
+ printf(" 9 BT.2020 KR = 0.2627; KB = 0.0593\n");
+ return -1;
+ }
+ float kr = (float)atof(argv[1]);
+ float kb = (float)atof(argv[2]);
+ float kg = 1 - kr - kb;
+
+ float vr = 2 * (1 - kr);
+ float ug = 2 * ((1 - kb) * kb / kg);
+ float vg = 2 * ((1 - kr) * kr / kg);
+ float ub = 2 * (1 - kb);
+
+ printf("Full range\n");
+ printf("R = Y + V * %5f\n", vr);
+ printf("G = Y - U * %6f - V * %6f\n", ug, vg);
+ printf("B = Y + U * %5f\n", ub);
+
+ printf("KR = %4f; ", kr);
+ printf("KB = %4f\n", kb);
+ // printf("KG = %4f\n", kg);
+ // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+ // #define YB 32 /* 64 / 2 */
+ //
+ // // U and V contributions to R,G,B.
+
+ printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64);
+ printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64);
+ printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64);
+ printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64);
+
+ vr = 255.f / 224.f * 2 * (1 - kr);
+ ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
+ vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg);
+ ub = 255.f / 224.f * 2 * (1 - kb);
+
+ printf("\nLimited range\n");
+ printf("R = (Y - 16) * 1.164 + V * %5f\n", vr);
+ printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
+ printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
+
+ // printf("KG = %4f\n", kg);
+ // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+ // #define YB 32 /* 64 / 2 */
+ //
+ // // U and V contributions to R,G,B.
+
+ printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64);
+ printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64);
+ printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64);
+ printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64);
+
+ return 0;
+}
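
As a sanity check, running yuvconstants 0.2126 0.0722 reproduces the BT.709 full-range reference quoted in the header comment: with kg = 0.7152, vr = 2*(1 - 0.2126) = 1.5748, ub = 2*(1 - 0.0722) = 1.8556, ug = 2*(1 - 0.0722)*0.0722/0.7152 = 0.18732, and vg = 2*(1 - 0.2126)*0.2126/0.7152 = 0.46812.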
diff --git a/files/util/yuvconvert.cc b/util/yuvconvert.cc
index 27cdfe9e..93b52668 100644
--- a/files/util/yuvconvert.cc
+++ b/util/yuvconvert.cc
@@ -42,9 +42,9 @@ static __inline uint32_t Abs(int32_t v) {
}
// Parse PYUV format, e.g. name.1920x800_24Hz_P420.yuv
-bool ExtractResolutionFromFilename(const char* name,
- int* width_ptr,
- int* height_ptr) {
+static bool ExtractResolutionFromFilename(const char* name,
+ int* width_ptr,
+ int* height_ptr) {
// Isolate the .width_height. section of the filename by searching for a
// dot or underscore followed by a digit.
for (int i = 0; name[i]; ++i) {
@@ -59,7 +59,7 @@ bool ExtractResolutionFromFilename(const char* name,
return false;
}
-void PrintHelp(const char* program) {
+static void PrintHelp(const char* program) {
printf("%s [-options] src_argb.raw dst_yuv.raw\n", program);
printf(
" -s <width> <height> .... specify source resolution. "
@@ -78,7 +78,7 @@ void PrintHelp(const char* program) {
exit(0);
}
-void ParseOptions(int argc, const char* argv[]) {
+static void ParseOptions(int argc, const char* argv[]) {
if (argc <= 1) {
PrintHelp(argv[0]);
}
@@ -165,23 +165,23 @@ static int TileARGBScale(const uint8_t* src_argb,
int src_height,
uint8_t* dst_argb,
int dst_stride_argb,
- int dst_width,
- int dst_height,
+ int destination_width,
+ int destination_height,
libyuv::FilterMode filtering) {
- for (int y = 0; y < dst_height; y += kTileY) {
- for (int x = 0; x < dst_width; x += kTileX) {
+ for (int y = 0; y < destination_height; y += kTileY) {
+ for (int x = 0; x < destination_width; x += kTileX) {
int clip_width = kTileX;
- if (x + clip_width > dst_width) {
- clip_width = dst_width - x;
+ if (x + clip_width > destination_width) {
+ clip_width = destination_width - x;
}
int clip_height = kTileY;
- if (y + clip_height > dst_height) {
- clip_height = dst_height - y;
+ if (y + clip_height > destination_height) {
+ clip_height = destination_height - y;
}
int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb, src_width,
src_height, dst_argb, dst_stride_argb,
- dst_width, dst_height, x, y, clip_width,
- clip_height, filtering);
+ destination_width, destination_height, x, y,
+ clip_width, clip_height, filtering);
if (r) {
return r;
}
diff --git a/files/winarm.mk b/winarm.mk
index c4307a43..b0a344ae 100644
--- a/files/winarm.mk
+++ b/winarm.mk
@@ -31,6 +31,7 @@ LOCAL_OBJ_FILES = \
source/scale_any.o\
source/scale_argb.o\
source/scale_common.o\
+ source/scale_uv.o\
source/video_common.o
.cc.o: